Jul 26, 2007

Fun with Unicode

I caught a blog entry by Alexey Zakhlestin via Planet-PHP today which asked the question "Why aren't unicode math symbols supported by programming languages?". The obvious answer, of course, is that languages work just fine with the symbols they have and there's no need to mess with a good thing (it doesn't help that typing these symbols is a PITA on your average terminal).

Being a whimsical sort, I decided that actually implementing his request would be more fun than simply pish-poshing it. I'm not suggesting this be part of PHP6 (I still don't personally think it's a good idea), but it's a fun exercise and a good conversation starter... Here are just a few of the things I can do now:

<?php
var_dump(¼, ½, ¾);
// float(0.25)
// float(0.5)
// float(0.75)

var_dump(1 ≤ 2, 2 ≯ 3, 5 ≠ 6);
// bool(true)
// bool(true)
// bool(true)

var_dump(3 × 4, 15 ÷ 5);
// int(12)
// int(3)

var_dump(1 « 3);
// int(8)

/* Your font may be too small,
 * but that's a skull and crossbones
 */
☠('aka die/exit');

P.S. - Just FYI, I am aware that the patch referenced above has flaws... The problems are fixable, I just didn't bother fixing 'em because this isn't a serious patch anyway.... Best spend that time on something worthwhile...


P.P.S. - Since the original posting of this article, PHP6-Unicode has been... well, essentially scrapped. So the patch itself is no longer relevant.


Index: Zend/zend_language_scanner.l
===================================================================
RCS file: /repository/ZendEngine2/zend_language_scanner.l,v
retrieving revision 1.167
diff -u -p -r1.167 zend_language_scanner.l
--- Zend/zend_language_scanner.l 12 Jul 2007 09:23:48 -0000 1.167
+++ Zend/zend_language_scanner.l 27 Jul 2007 05:05:47 -0000
@@ -402,6 +402,124 @@ ZEND_API int zend_copy_scanner_string(zv
 	return 1;
 }

+/* Used by {LABEL} for converting unicode operator symbols.
+ *
+ * Looks only at the first code unit of zendlval's unicode string and maps it
+ * to the token of the equivalent ASCII operator, numeric literal or keyword.
+ *
+ * str, str_len - raw scanner input; conv is run over it to find out how many
+ *                bytes encode that first code unit.
+ * oplen        - receives that byte count on success, untouched otherwise.
+ * have_equal   - non-zero selects the compound-assignment token for the shift,
+ *                multiplication and division symbols; ignored for the rest.
+ *
+ * Returns the token, or 0 if the symbol is not recognised or its length could
+ * not be determined. On success zendlval is destroyed and becomes a double for
+ * the vulgar fractions (T_DNUMBER) or NULL for everything else; on a 0 return
+ * it is left exactly as it was passed in.
+ */
+static inline int zend_scan_unicode_operator(zval *zendlval, char *str, zend_uint str_len, UConverter *conv, int *oplen, int have_equal TSRMLS_DC)
+{
+	int ret = 0;
+	double fraction = 0.0;
+	UChar buffer[2], *buffer_ptr = buffer;
+	UErrorCode status = U_ZERO_ERROR;
+	const char *str_ptr = str;
+
+	switch (Z_USTRVAL_P(zendlval)[0]) {
+		case 0x2260: /* NOT EQUAL TO */
+			ret = T_IS_NOT_EQUAL;
+			break;
+
+		case 0x2264: /* LESS-THAN OR EQUAL TO */
+		case 0x226F: /* NOT GREATER-THAN */
+			ret = T_IS_SMALLER_OR_EQUAL;
+			break;
+
+		case 0x2265: /* GREATER-THAN OR EQUAL TO */
+		case 0x226E: /* NOT LESS-THAN */
+			ret = T_IS_GREATER_OR_EQUAL;
+			break;
+
+		case 0x00AB: /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
+		case 0x226A: /* MUCH LESS THAN */
+			ret = have_equal ? T_SL_EQUAL : T_SL;
+			break;
+
+		case 0x00BB: /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
+		case 0x226B: /* MUCH MORE THAN */
+			ret = have_equal ? T_SR_EQUAL : T_SR;
+			break;
+
+		case 0x2270: /* NEITHER LESS-THAN NOR EQUAL TO */
+			ret = '>';
+			break;
+
+		case 0x2271: /* NEITHER GREATER-THAN NOR EQUAL TO */
+			ret = '<';
+			break;
+
+		case 0x2276: /* LESS-THAN OR GREATER-THAN */
+		case 0x2277: /* GREATER-THAN OR LESS-THAN */
+			ret = T_IS_NOT_EQUAL;
+			break;
+
+		case 0x2278: /* NEITHER LESS-THAN NOR GREATER-THAN */
+		case 0x2279: /* NEITHER GREATER-THAN NOR LESS-THAN */
+			ret = T_IS_EQUAL;
+			break;
+
+		case 0x00D7: /* MULTIPLICATION SIGN */
+			ret = have_equal ? T_MUL_EQUAL : '*';
+			break;
+
+		case 0x00F7: /* DIVISION SIGN */
+			ret = have_equal ? T_DIV_EQUAL : '/';
+			break;
+
+		case 0x00BC: /* VULGAR FRACTION ONE QUARTER */
+			fraction = 0.25;
+			ret = T_DNUMBER;
+			break;
+
+		case 0x00BD: /* VULGAR FRACTION ONE HALF */
+			fraction = 0.5;
+			ret = T_DNUMBER;
+			break;
+
+		case 0x00BE: /* VULGAR FRACTION THREE QUARTERS */
+			fraction = 0.75;
+			ret = T_DNUMBER;
+			break;
+
+		case 0x2620: /* SKULL AND CROSSBONES */
+			ret = T_EXIT;
+			break;
+
+		default: /* Not an operator symbol: leave everything untouched */
+			return 0;
+	}
+
+	/* How much of the input do we need to consume for this codepoint in this character set?
+	 * The target holds a single code unit, so U_BUFFER_OVERFLOW_ERROR merely means more input follows. */
+	ucnv_toUnicode(conv, &buffer_ptr, buffer + 1, &str_ptr, str + str_len, NULL, TRUE, &status);
+	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
+		/* Shouldn't happen... */
+		return 0;
+	}
+	*oplen = (int)(str_ptr - str);
+
+	/* Only now that the token is certain may the label string be given up */
+	zval_dtor(zendlval);
+	if (ret == T_DNUMBER) {
+		ZVAL_DOUBLE(zendlval, fraction);
+	} else {
+		ZVAL_NULL(zendlval);
+	}
+
+	return ret;
+}
+
 static inline int zend_check_and_normalize_identifier(zval *zendlval)
 {
 	UChar *norm;
@@ -2217,13 +2320,58 @@ HEREDOC_CHARS ("{"*([^$\n\r\\{]|("
 	return T_ENCAPSED_AND_WHITESPACE;
 }

-{LABEL} {
-	if (!zend_copy_scanner_string(zendlval, yytext, yyleng, UG(unicode)?IS_UNICODE:IS_STRING, SCNG(output_conv) TSRMLS_CC)) {
-		return 0;
+{LABEL}"="? {
+	/* Identifiers and, in Unicode mode, operator symbols that matched {LABEL}.
+	 * The optional '=' lets a compound operator such as "×=" arrive as one lexeme;
+	 * it is pushed back with yyless() whenever it is not part of the token. */
+	zend_bool have_equal = 0;
+
+	if (yytext[yyleng-1] == '=') {
+		have_equal = 1;
 	}
-	if (UG(unicode) && !zend_check_and_normalize_identifier(zendlval)) {
+
+	if (!zend_copy_scanner_string(zendlval, yytext, yyleng - have_equal, UG(unicode)?IS_UNICODE:IS_STRING, SCNG(output_conv) TSRMLS_CC)) {
 		return 0;
 	}
+
+	if (UG(unicode)) {
+		UChar *norm;
+		int norm_len;
+
+		if (!zend_is_valid_identifier(Z_USTRVAL_P(zendlval), Z_USTRLEN_P(zendlval))) {
+			/* The '=' can only belong to an operator when the label is that one symbol and nothing else */
+			zend_bool op_equal = have_equal && Z_USTRLEN_P(zendlval) == 1;
+			int oplen = 0, ret = zend_scan_unicode_operator(zendlval, yytext, yyleng, SCNG(output_conv), &oplen, op_equal TSRMLS_CC);
+
+			if (ret) {
+				/* Only the compound-assignment tokens consume the '='; leave it in the input for everything else */
+				if (ret != T_SL_EQUAL && ret != T_SR_EQUAL && ret != T_MUL_EQUAL && ret != T_DIV_EQUAL) {
+					op_equal = 0;
+				}
+				yyless(oplen + op_equal);
+				return ret;
+			}
+
+			/* Report first: the message reads the string that zval_dtor() releases */
+			zend_error(E_COMPILE_WARNING, "Invalid identifier syntax: %r", Z_USTRVAL_P(zendlval));
+			zval_dtor(zendlval);
+			return 0;
+		}
+
+		if (zend_normalize_identifier(&norm, &norm_len, Z_USTRVAL_P(zendlval), Z_USTRLEN_P(zendlval), 0) == FAILURE) {
+			zend_error(E_COMPILE_WARNING, "Could not normalize identifier: %r", Z_USTRVAL_P(zendlval));
+			efree(Z_USTRVAL_P(zendlval));
+			return 0;
+		}
+
+		if (norm != Z_USTRVAL_P(zendlval)) {
+			efree(Z_USTRVAL_P(zendlval));
+			ZVAL_UNICODEL(zendlval, norm, norm_len, 0);
+		}
+	}
+	/* Hand the optional '=' back to the scanner; it is not part of the identifier */
+	yyless(yyleng - have_equal);
+
 	return T_STRING;
 }