Commit f67794c5 authored by simonmar's avatar simonmar
Browse files

[project @ 2006-01-06 16:34:56 by simonmar]

Unicode source tests
parent 9a5b6efd
TOP=../../../..
include $(TOP)/mk/boilerplate.mk
include $(TOP)/mk/test.mk
# test that we catch UTF-8 decoding errors
test('utf8_002', normal, compile_fail, [''])
test('utf8_003', normal, compile_fail, [''])
test('utf8_004', normal, compile_fail, [''])
test('utf8_005', normal, compile_fail, [''])
test('utf8_010', normal, compile_fail, [''])
test('utf8_011', normal, compile_fail, [''])
test('utf8_020', normal, compile_fail, [''])
test('utf8_021', normal, compile_fail, [''])
test('utf8_022', normal, compile_fail, [''])
# test that we can understand unicode characters in lexemes
test('unicode001', normal, compile_and_run, [''])
{-# OPTIONS_GHC -fglasgow-exts #-}
{-
Test for valid unicode identifiers
*** This file is UTF-8 encoded.
*** BE CAREFUL WHEN EDITING THIS FILE WITH EMACS. Emacs' UTF-8 engine
has several times got the encoding wrong for me and inserted bogus
bytes, especially in the 4-byte characters. Edit the file literally
(M-x find-file-literally). By all means view it in Emacs' UTF-8
more (C-x RET c utf-8, C-x f unicode001.hs), but don't edit and save.
Here's a selection of characters I pulled from UnicodeData.txt that we
can use to test with:
-- upper/lower case letters
À LATIN CAPITAL LETTER A WITH GRAVE;Lu;0;L;0041 0300;;;;N;LATIN CAPITAL LETTER A GRAVE;;;00E0;
à LATIN SMALL LETTER A WITH GRAVE;Ll;0;L;0061 0300;;;;N;LATIN SMALL LETTER A GRAVE;;00C0;;00C0
Α GREEK CAPITAL LETTER ALPHA;Lu;0;L;;;;;N;;;;03B1;
α GREEK SMALL LETTER ALPHA;Ll;0;L;;;;;N;;;0391;;0391
α GREEK SMALL LETTER ALPHA;Ll;0;L;;;;;N;;;0391;;0391
β GREEK SMALL LETTER BETA;Ll;0;L;;;;;N;;;0392;;0392
γ GREEK SMALL LETTER GAMMA;Ll;0;L;;;;;N;;;0393;;0393
δ GREEK SMALL LETTER DELTA;Ll;0;L;;;;;N;;;0394;;0394
Ⴀ GEORGIAN CAPITAL LETTER AN;Lu;0;L;;;;;N;;Khutsuri;;;
ა GEORGIAN LETTER AN;Lo;0;L;;;;;N;GEORGIAN SMALL LETTER AN;;;;
Ϣ COPTIC CAPITAL LETTER SHEI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER SHEI;;;03E3;
ϣ COPTIC SMALL LETTER SHEI;Ll;0;L;;;;;N;GREEK SMALL LETTER SHEI;;03E2;;03E2
А CYRILLIC CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0430;
а CYRILLIC SMALL LETTER A;Ll;0;L;;;;;N;;;0410;;0410
Ա ARMENIAN CAPITAL LETTER AYB;Lu;0;L;;;;;N;;;;0561;
ա ARMENIAN SMALL LETTER AYB;Ll;0;L;;;;;N;;;0531;;0531
𝐴 MATHEMATICAL ITALIC CAPITAL A;Lu;0;L;<font> 0041;;;;N;;;;;
𝑎 MATHEMATICAL ITALIC SMALL A;Ll;0;L;<font> 0061;;;;N;;;;;
𝔸 MATHEMATICAL DOUBLE-STRUCK CAPITAL A;Lu;0;L;<font> 0041;;;;N;;;;;
𝕒 MATHEMATICAL DOUBLE-STRUCK SMALL A;Ll;0;L;<font> 0061;;;;N;;;;;
-- title case letters
Dž LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON;Lt;0;L;<compat> 0044 017E;;;;N;LATIN LETTER CAPITAL D SMALL Z HACEK;;01C4;01C6;01C5
Lj LATIN CAPITAL LETTER L WITH SMALL LETTER J;Lt;0;L;<compat> 004C 006A;;;;N;LATIN LETTER CAPITAL L SMALL J;;01C7;01C9;01C8
-- small caps
ᴀ LATIN LETTER SMALL CAPITAL A;Ll;0;L;;;;;N;;;;;
ᴦ GREEK LETTER SMALL CAPITAL GAMMA;Ll;0;L;;;;;N;;;;;
-- caseless letters
ଅ ;ORIYA LETTER A;Lo;0;L;;;;;N;;;;;
அ TAMIL LETTER A;Lo;0;L;;;;;N;;;;;
అ TELUGU LETTER A;Lo;0;L;;;;;N;;;;;
ಅ KANNADA LETTER A;Lo;0;L;;;;;N;;;;;
അ MALAYALAM LETTER A;Lo;0;L;;;;;N;;;;;
අ SINHALA LETTER AYANNA;Lo;0;L;;;;;N;;;;;
ก THAI CHARACTER KO KAI;Lo;0;L;;;;;N;THAI LETTER KO KAI;;;;
ກ LAO LETTER KO;Lo;0;L;;;;;N;;;;;
ཀ TIBETAN LETTER KA;Lo;0;L;;;;;N;;;;;
က MYANMAR LETTER KA;Lo;0;L;;;;;N;;;;;
ᄀ HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;g *;;;
ሀ ETHIOPIC SYLLABLE HA;Lo;0;L;;;;;N;;;;;
Ꭰ CHEROKEE LETTER A;Lo;0;L;;;;;N;;;;;
ᐁ CANADIAN SYLLABICS E;Lo;0;L;;;;;N;;;;;
ᚁ OGHAM LETTER BEITH;Lo;0;L;;;;;N;;;;;
ᚠ RUNIC LETTER FEHU FEOH FE F;Lo;0;L;;;;;N;;;;;
ᜀ TAGALOG LETTER A;Lo;0;L;;;;;N;;;;;
ᜠ HANUNOO LETTER A;Lo;0;L;;;;;N;;;;;
ᝀ BUHID LETTER A;Lo;0;L;;;;;N;;;;;
ᝠ TAGBANWA LETTER A;Lo;0;L;;;;;N;;;;;
ក KHMER LETTER KA;Lo;0;L;;;;;N;;;;;
ᠠ MONGOLIAN LETTER A;Lo;0;L;;;;;N;;;;;
ᤁ LIMBU LETTER KA;Lo;0;L;;;;;N;;;;;
ᥐ TAI LE LETTER KA;Lo;0;L;;;;;N;;;;;
ぁ HIRAGANA LETTER SMALL A;Lo;0;L;;;;;N;;;;;
ア KATAKANA LETTER A;Lo;0;L;;;;;N;;;;;
ㄅ BOPOMOFO LETTER B;Lo;0;L;;;;;N;;;;;
ㄱ HANGUL LETTER KIYEOK;Lo;0;L;<compat> 1100;;;;N;HANGUL LETTER GIYEOG;;;;
ㆠ BOPOMOFO LETTER BU;Lo;0;L;;;;;N;;;;;
ꀀ YI SYLLABLE IT;Lo;0;L;;;;;N;;;;;
-- spaces
  NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
  EN QUAD;Zs;0;WS;2002;;;;N;;;;;
  EN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
  THIN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
​ ZERO WIDTH SPACE;Zs;0;BN;;;;;N;;;;;
-- some symbols we might find useful in Haskell
← LEFTWARDS ARROW;Sm;0;ON;;;;;N;LEFT ARROW;;;;
→ RIGHTWARDS ARROW;Sm;0;ON;;;;;N;RIGHT ARROW;;;;
‖ DOUBLE VERTICAL LINE;Po;0;ON;;;;;N;DOUBLE VERTICAL BAR;;;;
∀ FOR ALL;Sm;0;ON;;;;;N;;;;;
∁ COMPLEMENT;Sm;0;ON;;;;;Y;;;;;
∃ THERE EXISTS;Sm;0;ON;;;;;Y;;;;;
∄ THERE DOES NOT EXIST;Sm;0;ON;2203 0338;;;;Y;;;;;
∅ EMPTY SET;Sm;0;ON;;;;;N;;;;;
∆ INCREMENT;Sm;0;ON;;;;;N;;;;;
∇ NABLA;Sm;0;ON;;;;;N;;;;;
∈ ELEMENT OF;Sm;0;ON;;;;;Y;;;;;
∉ NOT AN ELEMENT OF;Sm;0;ON;2208 0338;;;;Y;;;;;
∏ N-ARY PRODUCT;Sm;0;ON;;;;;N;;;;;
∑ N-ARY SUMMATION;Sm;0;ON;;;;;Y;;;;;
− MINUS SIGN;Sm;0;ET;;;;;N;;;;;
∓ MINUS-OR-PLUS SIGN;Sm;0;ET;;;;;N;;;;;
∕ DIVISION SLASH;Sm;0;ON;;;;;Y;;;;;
∘ RING OPERATOR;Sm;0;ON;;;;;N;;;;;
∙ BULLET OPERATOR;Sm;0;ON;;;;;N;;;;;
√ SQUARE ROOT;Sm;0;ON;;;;;Y;;;;;
∧ LOGICAL AND;Sm;0;ON;;;;;N;;;;;
∨ LOGICAL OR;Sm;0;ON;;;;;N;;;;;
∩ INTERSECTION;Sm;0;ON;;;;;N;;;;;
∪ UNION;Sm;0;ON;;;;;N;;;;;
≃ ASYMPTOTICALLY EQUAL TO;Sm;0;ON;;;;;Y;;;;;
≈ ALMOST EQUAL TO;Sm;0;ON;;;;;Y;;;;;
≠ NOT EQUAL TO;Sm;0;ON;003D 0338;;;;Y;;;;;
≙ ESTIMATES;Sm;0;ON;;;;;N;;;;;
≤ LESS-THAN OR EQUAL TO;Sm;0;ON;;;;;Y;LESS THAN OR EQUAL TO;;;;
≥ GREATER-THAN OR EQUAL TO;Sm;0;ON;;;;;Y;GREATER THAN OR EQUAL TO;;;;
≪ MUCH LESS-THAN;Sm;0;ON;;;;;Y;MUCH LESS THAN;;;;
≫ MUCH GREATER-THAN;Sm;0;ON;;;;;Y;MUCH GREATER THAN;;;;
⊂ SUBSET OF;Sm;0;ON;;;;;Y;;;;;
⊃ SUPERSET OF;Sm;0;ON;;;;;Y;;;;;
⊄ NOT A SUBSET OF;Sm;0;ON;2282 0338;;;;Y;;;;;
⊅ NOT A SUPERSET OF;Sm;0;ON;2283 0338;;;;Y;;;;;
⊆ SUBSET OF OR EQUAL TO;Sm;0;ON;;;;;Y;;;;;
⊇ SUPERSET OF OR EQUAL TO;Sm;0;ON;;;;;Y;;;;;
⊕ CIRCLED PLUS;Sm;0;ON;;;;;N;;;;;
⊖ CIRCLED MINUS;Sm;0;ON;;;;;N;;;;;
⊗ CIRCLED TIMES;Sm;0;ON;;;;;N;;;;;
⊘ CIRCLED DIVISION SLASH;Sm;0;ON;;;;;Y;;;;;
⊙ CIRCLED DOT OPERATOR;Sm;0;ON;;;;;N;;;;;
⊢ RIGHT TACK;Sm;0;ON;;;;;Y;;;;;
⊣ LEFT TACK;Sm;0;ON;;;;;Y;;;;;
⊤ DOWN TACK;Sm;0;ON;;;;;N;;;;;
⊥ UP TACK;Sm;0;ON;;;;;N;;;;;
⊦ ASSERTION;Sm;0;ON;;;;;Y;;;;;
⊧ MODELS;Sm;0;ON;;;;;Y;;;;;
⊨ TRUE;Sm;0;ON;;;;;Y;;;;;
⋂ N-ARY INTERSECTION;Sm;0;ON;;;;;N;;;;;
⋃ N-ARY UNION;Sm;0;ON;;;;;N;;;;;
⋅ DOT OPERATOR;Sm;0;ON;;;;;N;;;;;
⋯ MIDLINE HORIZONTAL ELLIPSIS;Sm;0;ON;;;;;N;;;;;
〈 LEFT-POINTING ANGLE BRACKET;Ps;0;ON;3008;;;;Y;BRA;;;;
〉 RIGHT-POINTING ANGLE BRACKET;Pe;0;ON;3009;;;;Y;KET;;;;
☹ WHITE FROWNING FACE;So;0;ON;;;;;N;;;;;
☺ WHITE SMILING FACE;So;0;ON;;;;;N;;;;;
⧺ DOUBLE PLUS;Sm;0;ON;;;;;N;;;;;
-- other random symbols
☣ BIOHAZARD SIGN;So;0;ON;;;;;N;;;;;
𝄬 MUSICAL SYMBOL FLAT UP;So;0;L;;;;;N;;;;;
𝌋 TETRAGRAM FOR CONTRARIETY;So;0;ON;;;;;N;;;;;
-- braille
⡍ ;BRAILLE PATTERN DOTS-1347;So;0;ON;;;;;N;;;;;
⣿ ;BRAILLE PATTERN DOTS-12345678;So;0;ON;;;;;N;;;;;
-- numbers
Ⅰ ;ROMAN NUMERAL ONE;Nl;0;L;<compat> 0049;;;1;N;;;;2170;
Ⅼ ;ROMAN NUMERAL FIFTY;Nl;0;L;<compat> 004C;;;50;N;;;;217C;
① ;CIRCLED DIGIT ONE;No;0;EN;<circle> 0031;;1;1;N;;;;;
⑴ ;PARENTHESIZED DIGIT ONE;No;0;EN;<compat> 0028 0031 0029;;1;1;N;;;;;
⒈ ;DIGIT ONE FULL STOP;No;0;EN;<compat> 0031 002E;;1;1;N;DIGIT ONE PERIOD;;;;
-}
module Main where
-- Test upper-case recognition:
data T
= À -- latin
| Α -- greek
| -- georgian
| Ϣ -- coptic
| А -- cyrillic
| Ա -- armenian
| 𝐴 -- maths italic
| 𝔸 -- maths double-struck
| Dž -- title case latin
-- Test lower-case recognition:
à α ϣ а ա 𝑎 𝕒 = undefined
-- Caseless characters in a string:
string = "ଅஅఅಅഅඅกກཀကᄀሀᎠᐁᚁᚠᜀᜠᝀᝠកᠠᤁᥐぁアㄅㄱㆠ" -- 29 chars
-- composition using a ring, greek type variables, and right arrows
() :: α β γ . (β γ) (α β) (α γ)
(f g) x = f (g x)
main = print length $ string
-- 0x80 is an invalid character
bad = '€'
-- buffer ends in 0xC0
À
\ No newline at end of file
-- buffer ends in 0xD0
Ð
\ No newline at end of file
-- buffer ends in 0xE0
à
\ No newline at end of file
-- buffer ends in 0xF0
ð
\ No newline at end of file
-- 0x80 is an invalid character
bad = '€'
-- 0xbf is an invalid character
bad = '¿'
-- A start sequence byte (0xC0) followed by an invalid continuation:
bad = "À."
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment