Commit e4a73f4f, authored by Ben Gamari and committed by Austin Seipp

Move GeneralCategory et al to GHC.Unicode

This allows these to be used from Text.Read.Lex without introducing import cycles.

Reviewed By: thomie, austin

Differential Revision: https://phabricator.haskell.org/D1121

GHC Trac Issues: #10444
parent 8be43dd9
......@@ -53,14 +53,12 @@ module Data.Char
) where
import GHC.Base
import GHC.Arr (Ix)
import GHC.Char
import GHC.Real (fromIntegral)
import GHC.Show
import GHC.Read (Read, readLitChar, lexLitChar)
import GHC.Read (readLitChar, lexLitChar)
import GHC.Unicode
import GHC.Num
import GHC.Enum
-- $setup
-- Allow the use of Prelude in doctests.
......@@ -105,121 +103,6 @@ digitToInt c
hexl = ord c - ord 'a'
hexu = ord c - ord 'A'
-- | Unicode General Categories (column 2 of the UnicodeData table) in
-- the order they are listed in the Unicode standard (the Unicode
-- Character Database, in particular).
--
-- The order of the constructors is significant: 'generalCategory'
-- converts the result of @wgencat@ into a constructor with 'toEnum', so
-- the numbering given by the derived 'Enum' instance must keep matching
-- it. Do not reorder, insert or remove constructors independently of
-- that function.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> :t OtherLetter
-- OtherLetter :: GeneralCategory
--
-- 'Eq' instance:
--
-- >>> UppercaseLetter == UppercaseLetter
-- True
-- >>> UppercaseLetter == LowercaseLetter
-- False
--
-- 'Ord' instance:
--
-- >>> NonSpacingMark <= MathSymbol
-- True
--
-- 'Enum' instance:
--
-- >>> enumFromTo ModifierLetter SpacingCombiningMark
-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
--
-- 'Read' instance:
--
-- >>> read "DashPunctuation" :: GeneralCategory
-- DashPunctuation
-- >>> read "17" :: GeneralCategory
-- *** Exception: Prelude.read: no parse
--
-- 'Show' instance:
--
-- >>> show EnclosingMark
-- "EnclosingMark"
--
-- 'Bounded' instance:
--
-- >>> minBound :: GeneralCategory
-- UppercaseLetter
-- >>> maxBound :: GeneralCategory
-- NotAssigned
--
-- 'Ix' instance:
--
-- >>> import Data.Ix ( index )
-- >>> index (OtherLetter,Control) FinalQuote
-- 12
-- >>> index (OtherLetter,Control) Format
-- *** Exception: Error in array index
--
data GeneralCategory
= UppercaseLetter -- ^ Lu: Letter, Uppercase
| LowercaseLetter -- ^ Ll: Letter, Lowercase
| TitlecaseLetter -- ^ Lt: Letter, Titlecase
| ModifierLetter -- ^ Lm: Letter, Modifier
| OtherLetter -- ^ Lo: Letter, Other
| NonSpacingMark -- ^ Mn: Mark, Non-Spacing
| SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining
| EnclosingMark -- ^ Me: Mark, Enclosing
| DecimalNumber -- ^ Nd: Number, Decimal
| LetterNumber -- ^ Nl: Number, Letter
| OtherNumber -- ^ No: Number, Other
| ConnectorPunctuation -- ^ Pc: Punctuation, Connector
| DashPunctuation -- ^ Pd: Punctuation, Dash
| OpenPunctuation -- ^ Ps: Punctuation, Open
| ClosePunctuation -- ^ Pe: Punctuation, Close
| InitialQuote -- ^ Pi: Punctuation, Initial quote
| FinalQuote -- ^ Pf: Punctuation, Final quote
| OtherPunctuation -- ^ Po: Punctuation, Other
| MathSymbol -- ^ Sm: Symbol, Math
| CurrencySymbol -- ^ Sc: Symbol, Currency
| ModifierSymbol -- ^ Sk: Symbol, Modifier
| OtherSymbol -- ^ So: Symbol, Other
| Space -- ^ Zs: Separator, Space
| LineSeparator -- ^ Zl: Separator, Line
| ParagraphSeparator -- ^ Zp: Separator, Paragraph
| Control -- ^ Cc: Other, Control
| Format -- ^ Cf: Other, Format
| Surrogate -- ^ Cs: Other, Surrogate
| PrivateUse -- ^ Co: Other, Private Use
| NotAssigned -- ^ Cn: Other, Not Assigned
deriving (Eq, Ord, Enum, Read, Show, Bounded, Ix)
-- | The Unicode general category of the character. This relies on the
-- 'Enum' instance of 'GeneralCategory', which must remain in the
-- same order as the categories are presented in the Unicode
-- standard.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> generalCategory 'a'
-- LowercaseLetter
-- >>> generalCategory 'A'
-- UppercaseLetter
-- >>> generalCategory '0'
-- DecimalNumber
-- >>> generalCategory '%'
-- OtherPunctuation
-- >>> generalCategory '♥'
-- OtherSymbol
-- >>> generalCategory '\31'
-- Control
-- >>> generalCategory ' '
-- Space
--
generalCategory :: Char -> GeneralCategory
generalCategory c = toEnum (fromIntegral rawCategory)
  where
    -- Result of @wgencat@ for the character's code point; the derived
    -- 'Enum' numbering of 'GeneralCategory' turns it into a constructor.
    rawCategory = wgencat (fromIntegral (ord c))
-- derived character classifiers
-- | Selects alphabetic Unicode characters (lower-case, upper-case and
......@@ -360,96 +243,6 @@ isNumber c = case generalCategory c of
OtherNumber -> True
_ -> False
-- | Selects Unicode punctuation characters, including various kinds
-- of connectors, brackets and quotes.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'ConnectorPunctuation'
-- * 'DashPunctuation'
-- * 'OpenPunctuation'
-- * 'ClosePunctuation'
-- * 'InitialQuote'
-- * 'FinalQuote'
-- * 'OtherPunctuation'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Punctuation\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isPunctuation 'a'
-- False
-- >>> isPunctuation '7'
-- False
-- >>> isPunctuation '♥'
-- False
-- >>> isPunctuation '"'
-- True
-- >>> isPunctuation '?'
-- True
-- >>> isPunctuation '—'
-- True
--
isPunctuation :: Char -> Bool
isPunctuation c = isPunctuationCategory (generalCategory c)
  where
    -- Accepts exactly the seven punctuation categories (Pc, Pd, Ps, Pe,
    -- Pi, Pf, Po); every other category is rejected.
    isPunctuationCategory :: GeneralCategory -> Bool
    isPunctuationCategory ConnectorPunctuation = True
    isPunctuationCategory DashPunctuation      = True
    isPunctuationCategory OpenPunctuation      = True
    isPunctuationCategory ClosePunctuation     = True
    isPunctuationCategory InitialQuote         = True
    isPunctuationCategory FinalQuote           = True
    isPunctuationCategory OtherPunctuation     = True
    isPunctuationCategory _                    = False
-- | Selects Unicode symbol characters, including mathematical and
-- currency symbols.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'MathSymbol'
-- * 'CurrencySymbol'
-- * 'ModifierSymbol'
-- * 'OtherSymbol'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Symbol\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isSymbol 'a'
-- False
-- >>> isSymbol '6'
-- False
-- >>> isSymbol '='
-- True
--
-- The definition of \"math symbol\" may be a little
-- counter-intuitive depending on one's background:
--
-- >>> isSymbol '+'
-- True
-- >>> isSymbol '-'
-- False
--
isSymbol :: Char -> Bool
isSymbol c = isSymbolCategory (generalCategory c)
  where
    -- Accepts exactly the four symbol categories (Sm, Sc, Sk, So); every
    -- other category is rejected.
    isSymbolCategory :: GeneralCategory -> Bool
    isSymbolCategory MathSymbol     = True
    isSymbolCategory CurrencySymbol = True
    isSymbolCategory ModifierSymbol = True
    isSymbolCategory OtherSymbol    = True
    isSymbolCategory _              = False
-- | Selects Unicode space and separator characters.
--
-- This function returns 'True' if its argument has one of the
......
......@@ -59,7 +59,7 @@ import Text.ParserCombinators.ReadPrec
import Data.Maybe
import GHC.Unicode ( isDigit )
import GHC.Unicode
import GHC.Num
import GHC.Real
import GHC.Float
......@@ -312,6 +312,8 @@ choose sps = foldr ((+++) . try_one) pfail sps
-- Simple instances of Read
--------------------------------------------------------------
-- 'GeneralCategory' is declared in "GHC.Unicode", whose @deriving@ clause
-- does not include 'Read'; the instance is derived standalone here instead
-- (presumably because the 'Read' class is not available where the type is
-- defined -- confirm against the module import graph).
deriving instance Read GeneralCategory
instance Read Char where
readPrec =
parens
......
......@@ -50,8 +50,8 @@ module GHC.Show
where
import GHC.Base
import GHC.Num
import GHC.List ((!!), foldr1, break)
import GHC.Num
-- | The @shows@ functions return a function that prepends the
-- output 'String' to an existing 'String'. This allows constant-time
......
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE CPP, NoImplicitPrelude #-}
{-# LANGUAGE CPP, NoImplicitPrelude, StandaloneDeriving #-}
{-# OPTIONS_HADDOCK hide #-}
-----------------------------------------------------------------------------
......@@ -19,11 +19,13 @@
-----------------------------------------------------------------------------
module GHC.Unicode (
GeneralCategory (..), generalCategory,
isAscii, isLatin1, isControl,
isAsciiUpper, isAsciiLower,
isPrint, isSpace, isUpper,
isLower, isAlpha, isDigit,
isOctDigit, isHexDigit, isAlphaNum,
isPunctuation, isSymbol,
toUpper, toLower, toTitle,
wgencat
) where
......@@ -31,10 +33,131 @@ module GHC.Unicode (
import GHC.Base
import GHC.Char (chr)
import GHC.Real
import GHC.Enum ( Enum (..), Bounded (..) )
import GHC.Arr ( Ix (..) )
import GHC.Num
-- The module that defines Data.Char.chr (GHC.Char, imported above) already
-- imports this, and we need 'Show' in scope to derive a Show instance
-- for GeneralCategory
import GHC.Show ( Show )
#include "HsBaseConfig.h"
-- | Unicode General Categories (column 2 of the UnicodeData table) in
-- the order they are listed in the Unicode standard (the Unicode
-- Character Database, in particular).
--
-- The order of the constructors is significant: 'generalCategory'
-- converts the result of @wgencat@ into a constructor with 'toEnum', so
-- the numbering given by the derived 'Enum' instance must keep matching
-- it. Do not reorder, insert or remove constructors independently of
-- that function.
--
-- The 'Read' instance demonstrated below is not part of this
-- declaration's @deriving@ clause; it is provided separately by a
-- standalone @deriving instance@ declaration.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> :t OtherLetter
-- OtherLetter :: GeneralCategory
--
-- 'Eq' instance:
--
-- >>> UppercaseLetter == UppercaseLetter
-- True
-- >>> UppercaseLetter == LowercaseLetter
-- False
--
-- 'Ord' instance:
--
-- >>> NonSpacingMark <= MathSymbol
-- True
--
-- 'Enum' instance:
--
-- >>> enumFromTo ModifierLetter SpacingCombiningMark
-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
--
-- 'Read' instance:
--
-- >>> read "DashPunctuation" :: GeneralCategory
-- DashPunctuation
-- >>> read "17" :: GeneralCategory
-- *** Exception: Prelude.read: no parse
--
-- 'Show' instance:
--
-- >>> show EnclosingMark
-- "EnclosingMark"
--
-- 'Bounded' instance:
--
-- >>> minBound :: GeneralCategory
-- UppercaseLetter
-- >>> maxBound :: GeneralCategory
-- NotAssigned
--
-- 'Ix' instance:
--
-- >>> import Data.Ix ( index )
-- >>> index (OtherLetter,Control) FinalQuote
-- 12
-- >>> index (OtherLetter,Control) Format
-- *** Exception: Error in array index
--
data GeneralCategory
= UppercaseLetter -- ^ Lu: Letter, Uppercase
| LowercaseLetter -- ^ Ll: Letter, Lowercase
| TitlecaseLetter -- ^ Lt: Letter, Titlecase
| ModifierLetter -- ^ Lm: Letter, Modifier
| OtherLetter -- ^ Lo: Letter, Other
| NonSpacingMark -- ^ Mn: Mark, Non-Spacing
| SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining
| EnclosingMark -- ^ Me: Mark, Enclosing
| DecimalNumber -- ^ Nd: Number, Decimal
| LetterNumber -- ^ Nl: Number, Letter
| OtherNumber -- ^ No: Number, Other
| ConnectorPunctuation -- ^ Pc: Punctuation, Connector
| DashPunctuation -- ^ Pd: Punctuation, Dash
| OpenPunctuation -- ^ Ps: Punctuation, Open
| ClosePunctuation -- ^ Pe: Punctuation, Close
| InitialQuote -- ^ Pi: Punctuation, Initial quote
| FinalQuote -- ^ Pf: Punctuation, Final quote
| OtherPunctuation -- ^ Po: Punctuation, Other
| MathSymbol -- ^ Sm: Symbol, Math
| CurrencySymbol -- ^ Sc: Symbol, Currency
| ModifierSymbol -- ^ Sk: Symbol, Modifier
| OtherSymbol -- ^ So: Symbol, Other
| Space -- ^ Zs: Separator, Space
| LineSeparator -- ^ Zl: Separator, Line
| ParagraphSeparator -- ^ Zp: Separator, Paragraph
| Control -- ^ Cc: Other, Control
| Format -- ^ Cf: Other, Format
| Surrogate -- ^ Cs: Other, Surrogate
| PrivateUse -- ^ Co: Other, Private Use
| NotAssigned -- ^ Cn: Other, Not Assigned
deriving (Show, Eq, Ord, Enum, Bounded, Ix)
-- | The Unicode general category of the character. This relies on the
-- 'Enum' instance of 'GeneralCategory', which must remain in the
-- same order as the categories are presented in the Unicode
-- standard.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> generalCategory 'a'
-- LowercaseLetter
-- >>> generalCategory 'A'
-- UppercaseLetter
-- >>> generalCategory '0'
-- DecimalNumber
-- >>> generalCategory '%'
-- OtherPunctuation
-- >>> generalCategory '♥'
-- OtherSymbol
-- >>> generalCategory '\31'
-- Control
-- >>> generalCategory ' '
-- Space
--
generalCategory :: Char -> GeneralCategory
generalCategory c = toEnum (fromIntegral rawCategory)
  where
    -- Result of @wgencat@ for the character's code point; the derived
    -- 'Enum' numbering of 'GeneralCategory' turns it into a constructor.
    rawCategory = wgencat (fromIntegral (ord c))
-- | Selects the first 128 characters of the Unicode character set,
-- corresponding to the ASCII character set.
isAscii :: Char -> Bool
......@@ -118,6 +241,96 @@ isHexDigit c = isDigit c ||
(fromIntegral (ord c - ord 'A')::Word) <= 5 ||
(fromIntegral (ord c - ord 'a')::Word) <= 5
-- | Selects Unicode punctuation characters, including various kinds
-- of connectors, brackets and quotes.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'ConnectorPunctuation'
-- * 'DashPunctuation'
-- * 'OpenPunctuation'
-- * 'ClosePunctuation'
-- * 'InitialQuote'
-- * 'FinalQuote'
-- * 'OtherPunctuation'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Punctuation\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isPunctuation 'a'
-- False
-- >>> isPunctuation '7'
-- False
-- >>> isPunctuation '♥'
-- False
-- >>> isPunctuation '"'
-- True
-- >>> isPunctuation '?'
-- True
-- >>> isPunctuation '—'
-- True
--
isPunctuation :: Char -> Bool
isPunctuation c = isPunctuationCategory (generalCategory c)
  where
    -- Accepts exactly the seven punctuation categories (Pc, Pd, Ps, Pe,
    -- Pi, Pf, Po); every other category is rejected.
    isPunctuationCategory :: GeneralCategory -> Bool
    isPunctuationCategory ConnectorPunctuation = True
    isPunctuationCategory DashPunctuation      = True
    isPunctuationCategory OpenPunctuation      = True
    isPunctuationCategory ClosePunctuation     = True
    isPunctuationCategory InitialQuote         = True
    isPunctuationCategory FinalQuote           = True
    isPunctuationCategory OtherPunctuation     = True
    isPunctuationCategory _                    = False
-- | Selects Unicode symbol characters, including mathematical and
-- currency symbols.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'MathSymbol'
-- * 'CurrencySymbol'
-- * 'ModifierSymbol'
-- * 'OtherSymbol'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Symbol\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isSymbol 'a'
-- False
-- >>> isSymbol '6'
-- False
-- >>> isSymbol '='
-- True
--
-- The definition of \"math symbol\" may be a little
-- counter-intuitive depending on one's background:
--
-- >>> isSymbol '+'
-- True
-- >>> isSymbol '-'
-- False
--
isSymbol :: Char -> Bool
isSymbol c = isSymbolCategory (generalCategory c)
  where
    -- Accepts exactly the four symbol categories (Sm, Sc, Sk, So); every
    -- other category is rejected.
    isSymbolCategory :: GeneralCategory -> Bool
    isSymbolCategory MathSymbol     = True
    isSymbolCategory CurrencySymbol = True
    isSymbolCategory ModifierSymbol = True
    isSymbolCategory OtherSymbol    = True
    isSymbolCategory _              = False
-- | Convert a letter to the corresponding upper-case letter, if any.
-- Any other character is returned unchanged.
toUpper :: Char -> Char
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment