Skip to content
Snippets Groups Projects
Commit adb1fe42 authored by Serge S. Gulin's avatar Serge S. Gulin :construction_worker: Committed by Marge Bot
Browse files

Unicode: make ucd2haskell build-able again

ucd2haskell tool used streamly library which version in cabal was out of date. It is updated to the latest version at hackage with deprecated parts rewritten.

Also following fixes were applied to existing code in suppose that from its last run the code structure was changed and now it was required to be up to date with actual folder structures:
1. Ghc module path environment got a suffix with `src`.
2. Generated code got
2.1 `GHC.Internal` prefix for `Data.*`.
2.2 `GHC.Unicode.Internal` swapped on `GHC.Internal.Unicode` according to actual structure.
parent 92aa65ea
No related branches found
No related tags found
No related merge requests found
# Generating GHC’s Unicode modules
`GHC.Unicode.Internal.*` modules are generated with the internal tool `ucd2haskell`.
`GHC.Internal.Unicode.*` modules are generated with the internal tool `ucd2haskell`.
```bash
cd ucd2haskell
......@@ -13,7 +13,7 @@ cd ucd2haskell
2. _Comment_ the line in `ucd.sh` with `VERIFY_CHECKSUM=y`.
3. Run `./ucd.sh download`.
4. Update the checksums in `ucd.sh` and _uncomment_ `VERIFY_CHECKSUM=y`.
5. Run `./ucd.sh generate`. This will generate the `GHC.Unicode.Internal.*`
5. Run `./ucd.sh generate`. This will generate the `GHC.Internal.Unicode.*`
modules.
6. Check and update the output of the tests `base/tests/unicodeXXX.hs`.
7. Compare with Python (see hereinafter) and fix any error.
......@@ -32,7 +32,7 @@ __Warning:__ A Python version with the _exact same Unicode version_ is required.
Check the properties of all the characters.
```bash
ghc -O2 tests/export_all_chars.hs
ghc -O2 tests/export_all_chars.hs
./tests/export_all_chars > tests/all_chars.csv
python3 tests/check_all_chars.py tests/all_chars.csv
```
......
......@@ -21,7 +21,6 @@ module Parser.Text (genModules) where
import Control.Exception (catch, IOException)
import Control.Monad (void)
import Control.Monad.IO.Class (MonadIO(liftIO))
import Data.Bits (Bits(..))
import Data.Word (Word8)
import Data.Char (chr, ord, isSpace)
......@@ -31,19 +30,20 @@ import Data.List (intersperse, unfoldr)
import Data.List.Split (splitWhen)
import Numeric (showHex)
import Streamly.Data.Fold (Fold)
import Streamly.Prelude (IsStream, SerialT)
import System.Directory (createDirectoryIfMissing)
import System.Environment (getEnv)
import System.FilePath ((</>), (<.>))
-- import qualified Data.Set as Set
import qualified Streamly.Prelude as Stream
import Streamly.Data.Stream (Stream)
import qualified Streamly.Data.Stream.Prelude as Stream
import qualified Streamly.Data.Fold as Fold
import qualified Streamly.Internal.Data.Fold as Fold
import qualified Streamly.Data.Unfold as Unfold
import qualified Streamly.FileSystem.Handle as Handle
import qualified System.IO as Sys
import qualified Streamly.Unicode.Stream as Unicode
import qualified Streamly.Internal.Unicode.Stream as Unicode
import qualified System.IO as Sys
import Prelude hiding (pred)
......@@ -271,7 +271,7 @@ genUnicodeVersion outdir = do
(\(_ :: IOException) -> return "<unknown>")
Stream.fold f (Stream.fromList (body version))
where
moduleName = "GHC.Unicode.Internal.Version"
moduleName = "GHC.Internal.Unicode.Version"
f = moduleFileEmitter Nothing outdir
(moduleName, \_ -> Fold.foldMap (<> "\n"))
body :: String -> [String]
......@@ -284,12 +284,12 @@ genUnicodeVersion outdir = do
, "(unicodeVersion)"
, "where"
, ""
, "import {-# SOURCE #-} Data.Version"
, "import {-# SOURCE #-} GHC.Internal.Data.Version"
, ""
, "-- | Version of Unicode standard used by @base@:"
, "-- [" <> version <> "](https://www.unicode.org/versions/Unicode" <> version <> "/)."
, "--"
, "-- @since 4.15.0.0"
, "-- @since base-4.15.0.0"
, "unicodeVersion :: Version"
, "unicodeVersion = makeVersion [" <> mkVersion version <> "]" ]
mkVersion = foldr (\c acc -> case c of {'.' -> ',':' ':acc; _ -> c:acc}) mempty
......@@ -331,8 +331,8 @@ genGeneralCategoryModule moduleName =
, "(generalCategory)"
, "where"
, ""
, "import GHC.Base (Char, Int, Ord(..), ord)"
, "import GHC.Unicode.Internal.Bits (lookupIntN)"
, "import GHC.Internal.Base (Char, Int, Ord(..), ord)"
, "import GHC.Internal.Unicode.Bits (lookupIntN)"
, ""
, genEnumBitmap "generalCategory" Cn (reverse acc)
]
......@@ -415,7 +415,7 @@ genDecomposableModule moduleName dtype =
, "where"
, ""
, "import Data.Char (ord)"
, "import GHC.Unicode.Internal.Bits (lookupBit64)"
, "import GHC.Internal.Unicode.Bits (lookupBit64)"
, ""
, genBitmap "isDecomposable" (reverse st)
]
......@@ -443,7 +443,7 @@ genCombiningClassModule moduleName =
, "where"
, ""
, "import Data.Char (ord)"
, "import GHC.Unicode.Internal.Bits (lookupBit64)"
, "import GHC.Internal.Unicode.Bits (lookupBit64)"
, ""
, "combiningClass :: Char -> Int"
, unlines (reverse st1)
......@@ -566,8 +566,8 @@ genCompositionsModule moduleName compExclu non0CC =
, "(compose, composeStarters, isSecondStarter)"
, "where"
, ""
, "import GHC.Base (Char, ord)"
, "import GHC.Unicode.Internal.Bits (lookupBit64)"
, "import GHC.Internal.Base (Char, ord)"
, "import GHC.Internal.Unicode.Bits (lookupBit64)"
, ""
]
......@@ -616,7 +616,7 @@ genSimpleCaseMappingModule moduleName funcName field =
, "(" <> funcName <> ")"
, "where"
, ""
, "import GHC.Base (Char)"
, "import GHC.Internal.Base (Char)"
, ""
]
genSign =
......@@ -670,8 +670,8 @@ genCorePropertiesModule moduleName isProp =
, "(" <> unwords (intersperse "," (map prop2FuncName exports)) <> ")"
, "where"
, ""
, "import GHC.Base (Bool, Char, Ord(..), (&&), ord)"
, "import GHC.Unicode.Internal.Bits (lookupBit64)"
, "import GHC.Internal.Base (Bool, Char, Ord(..), (&&), ord)"
, "import GHC.Internal.Unicode.Bits (lookupBit64)"
, ""
]
......@@ -818,7 +818,7 @@ parsePropertyLine ln
isDivider :: String -> Bool
isDivider x = x == "# ================================================"
parsePropertyLines :: (IsStream t, Monad m) => t m String -> t m PropertyLine
parsePropertyLines :: (Monad m) => Stream m String -> Stream m PropertyLine
parsePropertyLines =
Stream.splitOn isDivider
$ Fold.lmap parsePropertyLine
......@@ -843,11 +843,11 @@ Parse ranges according to https://www.unicode.org/reports/tr44/#Code_Point_Range
__Note:__ this does /not/ fill missing char entries,
i.e. entries with no explicit entry nor within a range.
-}
parseUnicodeDataLines :: forall t m. (IsStream t, Monad m) => t m String -> t m DetailedChar
parseUnicodeDataLines :: forall m. (Monad m) => Stream m String -> Stream m DetailedChar
parseUnicodeDataLines
= Stream.unfoldMany (Unfold.unfoldr unitToRange)
. Stream.foldMany ( Fold.lmap parseDetailedChar
$ Fold.mkFold_ step initial )
$ Fold.foldt' step initial id)
where
......@@ -913,19 +913,14 @@ parseDetailedChar line = case splitWhen (== ';') line of
-- Generation
-------------------------------------------------------------------------------
readLinesFromFile :: String -> SerialT IO String
readLinesFromFile :: String -> Stream IO String
readLinesFromFile file =
withFile file Sys.ReadMode
$ \h ->
Stream.unfold Handle.read h & Unicode.decodeUtf8
& unicodeLines Fold.toList
$ \h -> Handle.read h & Unicode.decodeUtf8 & Unicode.lines Fold.toList
where
unicodeLines = Stream.splitOnSuffix (== '\n')
withFile file_ mode =
Stream.bracket (liftIO $ Sys.openFile file_ mode) (liftIO . Sys.hClose)
Stream.bracketIO (Sys.openFile file_ mode) (Sys.hClose)
moduleToFileName :: String -> String
......@@ -995,7 +990,7 @@ testOutputFileEmitter outdir (name, fldGen) = Fold.rmapM action fldGen
runGenerator ::
FilePath
-> FilePath
-> (SerialT IO String -> SerialT IO a)
-> (Stream IO String -> Stream IO a)
-> FilePath
-> GeneratorRecipe a
-> IO ()
......@@ -1067,64 +1062,64 @@ genModules indir outdir props = do
-- [NOTE] Disabled generator
-- propList =
-- ("GHC.Unicode.Internal.Char.PropList"
-- ("GHC.Internal.Unicode.Char.PropList"
-- , (`genCorePropertiesModule` (`elem` props)))
derivedCoreProperties =
("GHC.Unicode.Internal.Char.DerivedCoreProperties"
("GHC.Internal.Unicode.Char.DerivedCoreProperties"
, (`genCorePropertiesModule` (`elem` props)))
-- [NOTE] Disabled generator
-- compositions exc non0 =
-- ( "GHC.Unicode.Internal.Char.UnicodeData.Compositions"
-- ( "GHC.Internal.Unicode.Char.UnicodeData.Compositions"
-- , \m -> genCompositionsModule m exc non0)
-- [NOTE] Disabled generator
-- combiningClass =
-- ( "GHC.Unicode.Internal.Char.UnicodeData.CombiningClass"
-- ( "GHC.Internal.Unicode.Char.UnicodeData.CombiningClass"
-- , genCombiningClassModule)
-- [NOTE] Disabled generator
-- decomposable =
-- ( "GHC.Unicode.Internal.Char.UnicodeData.Decomposable"
-- ( "GHC.Internal.Unicode.Char.UnicodeData.Decomposable"
-- , (`genDecomposableModule` Canonical))
-- [NOTE] Disabled generator
-- decomposableK =
-- ( "GHC.Unicode.Internal.Char.UnicodeData.DecomposableK"
-- ( "GHC.Internal.Unicode.Char.UnicodeData.DecomposableK"
-- , (`genDecomposableModule` Kompat))
-- [NOTE] Disabled generator
-- decompositions =
-- ( "GHC.Unicode.Internal.Char.UnicodeData.Decompositions"
-- ( "GHC.Internal.Unicode.Char.UnicodeData.Decompositions"
-- , \m -> genDecomposeDefModule m [] [] Canonical (const True))
-- [NOTE] Disabled generator
-- decompositionsK2 =
-- ( "GHC.Unicode.Internal.Char.UnicodeData.DecompositionsK2"
-- ( "GHC.Internal.Unicode.Char.UnicodeData.DecompositionsK2"
-- , \m -> genDecomposeDefModule m [] [] Kompat (>= 60000))
-- [NOTE] Disabled generator
-- decompositionsK =
-- let pre = ["import qualified " <> fst decompositionsK2 <> " as DK2", ""]
-- post = ["decompose c = DK2.decompose c"]
-- in ( "GHC.Unicode.Internal.Char.UnicodeData.DecompositionsK"
-- in ( "GHC.Internal.Unicode.Char.UnicodeData.DecompositionsK"
-- , \m -> genDecomposeDefModule m pre post Kompat (< 60000))
generalCategory =
( "GHC.Unicode.Internal.Char.UnicodeData.GeneralCategory"
( "GHC.Internal.Unicode.Char.UnicodeData.GeneralCategory"
, genGeneralCategoryModule)
simpleUpperCaseMapping =
( "GHC.Unicode.Internal.Char.UnicodeData.SimpleUpperCaseMapping"
( "GHC.Internal.Unicode.Char.UnicodeData.SimpleUpperCaseMapping"
, \m -> genSimpleCaseMappingModule m "toSimpleUpperCase" _simpleUppercaseMapping)
simpleLowerCaseMapping =
( "GHC.Unicode.Internal.Char.UnicodeData.SimpleLowerCaseMapping"
( "GHC.Internal.Unicode.Char.UnicodeData.SimpleLowerCaseMapping"
, \m -> genSimpleCaseMappingModule m "toSimpleLowerCase" _simpleLowercaseMapping)
simpleTitleCaseMapping =
( "GHC.Unicode.Internal.Char.UnicodeData.SimpleTitleCaseMapping"
( "GHC.Internal.Unicode.Char.UnicodeData.SimpleTitleCaseMapping"
, \m -> genSimpleCaseMappingModule m "toSimpleTitleCase" _simpleTitlecaseMapping)
-- unicode002Test =
......
......@@ -52,7 +52,7 @@ download_files() {
done
}
GHC_MODULE_PATH=$(realpath "$SCRIPT_DIR/../../")
GHC_MODULE_PATH=$(realpath "$SCRIPT_DIR/../../src")
# Generate the Haskell files.
run_generator() {
......@@ -63,7 +63,7 @@ run_generator() {
--core-prop Uppercase \
--core-prop Lowercase
# [NOTE] disabled generator
# --core-prop Alphabetic
# --core-prop Alphabetic
# --core-prop White_Space \
# --core-prop ID_Start \
# --core-prop ID_Continue \
......
......@@ -52,10 +52,11 @@ executable ucd2haskell
main-is: UCD2Haskell.hs
other-modules: Parser.Text
build-depends:
base >= 4.7 && < 4.18
, streamly >= 0.8 && < 0.9
base >= 4.7 && < 4.20
, streamly-core >= 0.2.2 && < 0.3
, streamly >= 0.10 && < 0.11
, split >= 0.2.3 && < 0.3
, getopt-generics >= 0.13 && < 0.14
, containers >= 0.5 && < 0.7
, directory >= 1.3.6 && < 1.3.7
, directory >= 1.3.6 && < 1.3.8
, filepath >= 1.4.2 && < 1.5
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment