Commit c67f3dcc authored by Bodigrim's avatar Bodigrim
Browse files

Experiment with case conversions

parent 1cbc95b0
......@@ -11,6 +11,7 @@ module CaseFolding
) where
import Arsec
import Data.Bits
data Fold = Fold {
code :: Char
......@@ -34,13 +35,19 @@ parseCF :: FilePath -> IO (Either ParseError CaseFolding)
parseCF name = parse entries name <$> readFile name
mapCF :: CaseFolding -> [String]
mapCF (CF _ ms) = typ ++ (map nice . filter p $ ms) ++ [last]
mapCF (CF _ ms) = typ ++ map printUnusual ms' ++ map printUsual usual ++ [last]
where
typ = ["foldMapping :: forall s. Char -> s -> Step (CC s) Char"
,"{-# NOINLINE foldMapping #-}"]
last = "foldMapping c s = Yield (toLower c) (CC s '\\0' '\\0')"
nice c = "-- " ++ name c ++ "\n" ++
"foldMapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")"
where [x,y,z] = (map showC . take 3) (mapping c ++ repeat '\0')
ms' = filter p ms
p f = status f `elem` "CF" &&
mapping f /= [toLower (code f)]
unusual = map code ms'
usual = filter (\c -> toLower c /= c && c `notElem` unusual) [minBound..maxBound]
typ = ["foldMapping :: Char# -> _ {- unboxed Int64 -}"
,"{-# NOINLINE foldMapping #-}"
,"foldMapping = \\case"]
last = " _ -> unI64 0"
printUnusual c = " -- " ++ name c ++ "\n" ++
" " ++ showC (code c) ++ "# -> unI64 " ++ show (ord x + (ord y `shiftL` 21) + (ord z `shiftL` 42))
where x:y:z:_ = mapping c ++ repeat '\0'
printUsual c = " " ++ showC c ++ "# -> unI64 " ++ show (ord (toLower c))
......@@ -22,14 +22,17 @@ main = do
let comments = map ("--" ++) $
take 2 (cfComments cfs) ++ take 2 (scComments scs)
mapM_ (hPutStrLn h) $
["{-# LANGUAGE Rank2Types #-}"
,"-- AUTOMATICALLY GENERATED - DO NOT EDIT"
["-- AUTOMATICALLY GENERATED - DO NOT EDIT"
,"-- Generated by scripts/CaseMapping.hs"] ++
comments ++
[""
,"{-# LANGUAGE LambdaCase, MagicHash, PartialTypeSignatures #-}"
,"{-# OPTIONS_GHC -Wno-partial-type-signatures #-}"
,"module Data.Text.Internal.Fusion.CaseMapping where"
,"import Data.Char"
,"import Data.Text.Internal.Fusion.Types"
,"import GHC.Int"
,"import GHC.Exts"
,"unI64 :: Int64 -> _ {- unboxed Int64 -}"
,"unI64 (I64# n) = n"
,""]
mapM_ (hPutStrLn h) (mapSC "upper" upper toUpper scs)
mapM_ (hPutStrLn h) (mapSC "lower" lower toLower scs)
......
......@@ -11,6 +11,7 @@ module SpecialCasing
) where
import Arsec
import Data.Bits
data SpecialCasing = SC { scComments :: [Comment], scCasing :: [Case] }
deriving (Show)
......@@ -40,17 +41,23 @@ parseSC name = parse entries name <$> readFile name
mapSC :: String -> (Case -> String) -> (Char -> Char) -> SpecialCasing
-> [String]
mapSC which access twiddle (SC _ ms) =
typ ++ (map nice . filter p $ ms) ++ [last]
typ ++ map printUnusual ms' ++ map printUsual usual ++ [last]
where
typ = [which ++ "Mapping :: forall s. Char -> s -> Step (CC s) Char"
,"{-# NOINLINE " ++ which ++ "Mapping #-}"]
last = which ++ "Mapping c s = Yield (to" ++ ucFirst which ++ " c) (CC s '\\0' '\\0')"
nice c = "-- " ++ name c ++ "\n" ++
which ++ "Mapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")"
where [x,y,z] = (map showC . take 3) (access c ++ repeat '\0')
ms' = filter p ms
p c = [k] /= a && a /= [twiddle k] && null (conditions c)
where a = access c
k = code c
unusual = map code ms'
usual = filter (\c -> twiddle c /= c && c `notElem` unusual) [minBound..maxBound]
typ = [which ++ "Mapping :: Char# -> _ {- unboxed Int64 -}"
,"{-# NOINLINE " ++ which ++ "Mapping #-}"
,which ++ "Mapping = \\case"]
last = " _ -> unI64 0"
printUnusual c = " -- " ++ name c ++ "\n" ++
" " ++ showC (code c) ++ "# -> unI64 " ++ show (ord x + (ord y `shiftL` 21) + (ord z `shiftL` 42))
where x:y:z:_ = access c ++ repeat '\0'
printUsual c = " " ++ showC c ++ "# -> unI64 " ++ show (ord (twiddle c))
ucFirst (c:cs) = toUpper c : cs
ucFirst [] = []
{-# LANGUAGE BangPatterns, MagicHash, Rank2Types #-}
{-# LANGUAGE BangPatterns, MagicHash, Rank2Types, PartialTypeSignatures #-}
{-# OPTIONS_GHC -Wno-partial-type-signatures #-}
-- |
-- Module : Data.Text.Internal.Fusion.Common
-- Copyright : (c) Bryan O'Sullivan 2009, 2012
......@@ -124,13 +125,15 @@ import Prelude (Bool(..), Char, Eq(..), Int, Integral, Maybe(..),
(&&), fromIntegral, otherwise)
import qualified Data.List as L
import qualified Prelude as P
import Data.Bits (shiftL, shiftR, (.&.))
import Data.Char (isLetter, isSpace)
import Data.Int (Int64)
import GHC.Int (Int64(..))
import Data.Text.Internal.Encoding.Utf8 (chr2, chr3, chr4, utf8LengthByLeader)
import Data.Text.Internal.Fusion.Types
import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, titleMapping,
upperMapping)
import Data.Text.Internal.Fusion.Size
import GHC.Exts (Char(..), Char#, chr#)
import GHC.Prim (Addr#, indexWord8OffAddr#)
import GHC.Types (Int(..))
import Data.Text.Internal.Unsafe.Char (unsafeChr8)
......@@ -478,17 +481,27 @@ intersperse c (Stream next0 s0 len) = Stream next (I1 s0) (len + unknownSize)
-- characters.
-- | Map a 'Stream' through the given case-mapping function.
caseConvert :: (forall s. Char -> s -> Step (CC s) Char)
caseConvert :: (Char# -> _ {- unboxed Int64 -})
-> Stream Char -> Stream Char
caseConvert remap (Stream next0 s0 len) =
Stream next (CC s0 '\0' '\0') (len `unionSize` (3*len))
Stream next (CC s0 0) (len `unionSize` (3*len))
where
next (CC s '\0' _) =
next (CC s 0) =
case next0 s of
Done -> Done
Skip s' -> Skip (CC s' '\0' '\0')
Yield c s' -> remap c s'
next (CC s a b) = Yield a (CC s b '\0')
Skip s' -> Skip (CC s' 0)
Yield c@(C# c#) s' -> case I64# (remap c#) of
0 -> Yield c (CC s' 0)
ab -> let (a, b) = chopOffChar ab in
Yield a (CC s' b)
next (CC s ab) = let (a, b) = chopOffChar ab in Yield a (CC s b)
chopOffChar :: Int64 -> (Char, Int64)
chopOffChar ab = (chr a, ab `shiftR` 21)
where
chr (I# n) = C# (chr# n)
mask = (1 `shiftL` 21) - 1
a = fromIntegral $ ab .&. mask
-- | /O(n)/ Convert a string to folded case. This function is mainly
-- useful for performing caseless (or case insensitive) string
......@@ -556,20 +569,25 @@ toLower = caseConvert lowerMapping
--
-- @ 'Data.Text.Internal.unstream' . 'toTitle' . 'Data.Text.Internal.Fusion.stream' = 'Data.Text.toTitle' @
toTitle :: Stream Char -> Stream Char
toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) '\0' '\0') (len + unknownSize)
toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) 0) (len + unknownSize)
where
next (CC (letter :*: s) '\0' _) =
next (CC (letter :*: s) 0) =
case next0 s of
Done -> Done
Skip s' -> Skip (CC (letter :*: s') '\0' '\0')
Yield c s'
| nonSpace -> if letter
then lowerMapping c (nonSpace :*: s')
else titleMapping c (letter' :*: s')
| otherwise -> Yield c (CC (letter' :*: s') '\0' '\0')
Skip s' -> Skip (CC (letter :*: s') 0)
Yield c@(C# c#) s'
| nonSpace, letter -> case I64# (lowerMapping c#) of
0 -> Yield c (CC (nonSpace :*: s') 0)
ab -> let (a, b) = chopOffChar ab in
Yield a (CC (nonSpace :*: s') b)
| nonSpace -> case I64# (titleMapping c#) of
0 -> Yield c (CC (letter' :*: s') 0)
ab -> let (a, b) = chopOffChar ab in
Yield a (CC (letter' :*: s') b)
| otherwise -> Yield c (CC (letter' :*: s') 0)
where nonSpace = P.not (isSpace c)
letter' = isLetter c
next (CC s a b) = Yield a (CC s b '\0')
next (CC s ab) = let (a, b) = chopOffChar ab in Yield a (CC s b)
{-# INLINE [0] toTitle #-}
data Justify i s = Just1 !i !s
......
......@@ -29,10 +29,11 @@ module Data.Text.Internal.Fusion.Types
) where
import Data.Text.Internal.Fusion.Size
import Data.Int (Int64)
import Data.Word (Word8)
-- | Specialised tuple for case conversion.
data CC s = CC !s {-# UNPACK #-} !Char {-# UNPACK #-} !Char
data CC s = CC !s {-# UNPACK #-} !Int64
-- | Restreaming state.
data RS s
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment