Skip to content
GitLab
Projects Groups Snippets
  • /
  • Help
    • Help
    • Support
    • Community forum
    • Submit feedback
  • Sign in / Register
  • GHC GHC
  • Project information
    • Project information
    • Activity
    • Labels
    • Members
  • Repository
    • Repository
    • Files
    • Commits
    • Branches
    • Tags
    • Contributors
    • Graph
    • Compare
    • Locked Files
  • Issues 5,264
    • Issues 5,264
    • List
    • Boards
    • Service Desk
    • Milestones
    • Iterations
  • Merge requests 568
    • Merge requests 568
  • CI/CD
    • CI/CD
    • Pipelines
    • Jobs
    • Schedules
    • Test Cases
  • Deployments
    • Deployments
    • Releases
  • Analytics
    • Analytics
    • Value stream
    • CI/CD
    • Code review
    • Insights
    • Issue
    • Repository
  • Wiki
    • Wiki
  • Snippets
    • Snippets
  • Activity
  • Graph
  • Create a new issue
  • Jobs
  • Commits
  • Issue Boards
Collapse sidebar
  • Glasgow Haskell CompilerGlasgow Haskell Compiler
  • GHCGHC
  • Issues
  • #18304
Closed
Open
Issue created Jun 06, 2020 by Ryan Scott@RyanGlScottMaintainer

Compiling tagsoup with optimizations loops infinitely on HEAD

Compiling the tagsoup-0.14.8 library on GHC HEAD (commit ad44b504) causes an infinite loop (or possibly just exponential compile times). Here is a standalone version of the module from tagsoup that blows up compile times:

{-# LANGUAGE RecordWildCards, PatternGuards #-}

module Text.HTML.TagSoup.Specification(dat) where

-- Code taken from the tagsoup library, which is BSD-3-licensed.

import Data.Char (isAlpha, isAlphaNum, isDigit, toLower)

-- We make some generalisations:
-- <!name is a valid tag start closed by >
-- <?name is a valid tag start closed by ?>
-- </!name> is a valid closing tag
-- </?name> is a valid closing tag
-- <a "foo"> is a valid tag attibute in ! and ?, i.e missing an attribute name
-- We also don't do lowercase conversion
-- Entities are handled without a list of known entity names
-- We don't have RCData, CData or Escape modes (only effects dat and tagOpen)


data TypeTag = TypeNormal -- <foo
             | TypeXml    -- <?foo
             | TypeDecl   -- <!foo
             | TypeScript -- <script
               deriving Eq


-- 2.4.1 Common parser idioms
white x = x `elem` " \t\n\f\r"


-- 8.2.4 Tokenization

type Parser = S -> [Out]

-- 8.2.4.1 Data state
dat :: Parser
dat S{..} = pos $ case hd of
    '&' -> charReference tl
    '<' -> tagOpen tl
    _ | eof -> []
    _ -> hd & dat tl


-- 8.2.4.2 Character reference data state
charReference s = charRef dat False Nothing s


-- 8.2.4.3 Tag open state
tagOpen S{..} = case hd of
    '!' -> markupDeclOpen tl
    '/' -> closeTagOpen tl
    _ | isAlpha hd -> Tag & hd & tagName (if isScript s then TypeScript else TypeNormal) tl
    '>' -> errSeen "<>" & '<' & '>' & dat tl
    '?' -> neilXmlTagOpen tl -- NEIL
    _ -> errSeen  "<" & '<' & dat s

isScript = f "script"
    where
        f (c:cs) S{..} = toLower hd == c && f cs tl
        f [] S{..} = white hd || hd == '/' || hd == '>' || hd == '?' || eof


-- seen "<?", emitted []
neilXmlTagOpen S{..} = case hd of
    _ | isAlpha hd -> Tag & '?' & hd & tagName TypeXml tl
    _ -> errSeen "<?" & '<' & '?' & dat s

-- seen "?", expecting ">"
neilXmlTagClose S{..} = pos $ case hd of
    '>' -> TagEnd & dat tl
    _ -> errSeen "?" & beforeAttName TypeXml s


-- just seen ">" at the end, am given tl
neilTagEnd typ S{..}
    | typ == TypeXml = pos $ errWant "?>" & TagEnd & dat s
    | typ == TypeScript = pos $ TagEnd & neilScriptBody s
    | otherwise = pos $ TagEnd & dat s

-- Inside a <script> tag, only break on </script
neilScriptBody o@S{..}
    | hd == '<', S{..} <- tl
    , hd == '/', S{..} <- tl
    , isScript s
    = dat o
    | eof = []
    | otherwise =  pos $ hd & neilScriptBody tl


-- 8.2.4.4 Close tag open state
-- Deviation: We ignore the if CDATA/RCDATA bits and tag matching
-- Deviation: On </> we output </> to the text
-- Deviation: </!name> is a closing tag, not a bogus comment
closeTagOpen S{..} = case hd of
    _ | isAlpha hd || hd `elem` "?!" -> TagShut & hd & tagName TypeNormal tl
    '>' -> errSeen "</>" & '<' & '/' & '>' & dat tl
    _ | eof -> '<' & '/' & dat s
    _ -> errWant "tag name" & bogusComment s


-- 8.2.4.5 Tag name state
tagName typ S{..} = pos $ case hd of
    _ | white hd -> beforeAttName typ tl
    '/' -> selfClosingStartTag typ tl
    '>' -> neilTagEnd typ tl
    '?' | typ == TypeXml -> neilXmlTagClose tl
    _ | isAlpha hd -> hd & tagName typ tl
    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
    _ -> hd & tagName typ tl


-- 8.2.4.6 Before attribute name state
beforeAttName typ S{..} = pos $ case hd of
    _ | white hd -> beforeAttName typ tl
    '/' -> selfClosingStartTag typ tl
    '>' -> neilTagEnd typ tl
    '?' | typ == TypeXml -> neilXmlTagClose tl
    _ | typ /= TypeNormal && hd `elem` "\'\"" -> beforeAttValue typ s -- NEIL
    _ | hd `elem` "\"'<=" -> errSeen [hd] & AttName & hd & attName typ tl
    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
    _ -> AttName & hd & attName typ tl


-- 8.2.4.7 Attribute name state
attName typ S{..} = pos $ case hd of
    _ | white hd -> afterAttName typ tl
    '/' -> selfClosingStartTag typ tl
    '=' -> beforeAttValue typ tl
    '>' -> neilTagEnd typ tl
    '?' | typ == TypeXml -> neilXmlTagClose tl
    _ | hd `elem` "\"'<" -> errSeen [hd] & def
    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
    _ -> def
    where def = hd & attName typ tl


-- 8.2.4.8 After attribute name state
afterAttName typ S{..} = pos $ case hd of
    _ | white hd -> afterAttName typ tl
    '/' -> selfClosingStartTag typ tl
    '=' -> beforeAttValue typ tl
    '>' -> neilTagEnd typ tl
    '?' | typ == TypeXml -> neilXmlTagClose tl
    _ | typ /= TypeNormal && hd `elem` "\"'" -> AttVal & beforeAttValue typ s -- NEIL
    _ | hd `elem` "\"'<" -> errSeen [hd] & def
    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
    _ -> def
    where def = AttName & hd & attName typ tl

-- 8.2.4.9 Before attribute value state
beforeAttValue typ S{..} = pos $ case hd of
    _ | white hd -> beforeAttValue typ tl
    '\"' -> AttVal & attValueDQuoted typ tl
    '&' -> AttVal & attValueUnquoted typ s
    '\'' -> AttVal & attValueSQuoted typ tl
    '>' -> errSeen "=" & neilTagEnd typ tl
    '?' | typ == TypeXml -> neilXmlTagClose tl
    _ | hd `elem` "<=" -> errSeen [hd] & def
    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
    _ -> def
    where def = AttVal & hd & attValueUnquoted typ tl


-- 8.2.4.10 Attribute value (double-quoted) state
attValueDQuoted typ S{..} = pos $ case hd of
    '\"' -> afterAttValueQuoted typ tl
    '&' -> charRefAttValue (attValueDQuoted typ) (Just '\"') tl
    _ | eof -> errWant "\"" & dat s
    _ -> hd & attValueDQuoted typ tl


-- 8.2.4.11 Attribute value (single-quoted) state
attValueSQuoted typ S{..} = pos $ case hd of
    '\'' -> afterAttValueQuoted typ tl
    '&' -> charRefAttValue (attValueSQuoted typ) (Just '\'') tl
    _ | eof -> errWant "\'" & dat s
    _ -> hd & attValueSQuoted typ tl


-- 8.2.4.12 Attribute value (unquoted) state
attValueUnquoted typ S{..} = pos $ case hd of
    _ | white hd -> beforeAttName typ tl
    '&' -> charRefAttValue (attValueUnquoted typ) Nothing tl
    '>' -> neilTagEnd typ tl
    '?' | typ == TypeXml -> neilXmlTagClose tl
    _ | hd `elem` "\"'<=" -> errSeen [hd] & def
    _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s
    _ -> def
    where def = hd & attValueUnquoted typ tl


-- 8.2.4.13 Character reference in attribute value state
charRefAttValue :: Parser -> Maybe Char -> Parser
charRefAttValue resume c s = charRef resume True c s


-- 8.2.4.14 After attribute value (quoted) state
afterAttValueQuoted typ S{..} = pos $ case hd of
    _ | white hd -> beforeAttName typ tl
    '/' -> selfClosingStartTag typ tl
    '>' -> neilTagEnd typ tl
    '?' | typ == TypeXml -> neilXmlTagClose tl
    _ | eof -> dat s
    _ -> errSeen [hd] & beforeAttName typ s


-- 8.2.4.15 Self-closing start tag state
selfClosingStartTag typ S{..} = pos $ case hd of
    _ | typ == TypeXml -> errSeen "/" & beforeAttName typ s
    '>' -> TagEndClose & dat tl
    _ | eof -> errWant ">" & dat s
    _ -> errSeen "/" & beforeAttName typ s


-- 8.2.4.16 Bogus comment state
bogusComment S{..} = Comment & bogusComment1 s
bogusComment1 S{..} = pos $ case hd of
    '>' -> CommentEnd & dat tl
    _ | eof -> CommentEnd & dat s
    _ -> hd & bogusComment1 tl


-- 8.2.4.17 Markup declaration open state
markupDeclOpen S{..} = case hd of
    _ | Just s <- next "--" -> Comment & commentStart s
    _ | isAlpha hd -> Tag & '!' & hd & tagName TypeDecl tl -- NEIL
    _ | Just s <- next "[CDATA[" -> cdataSection s
    _ -> errWant "tag name" & bogusComment s


-- 8.2.4.18 Comment start state
commentStart S{..} = pos $ case hd of
    '-' -> commentStartDash tl
    '>' -> errSeen "<!-->" & CommentEnd & dat tl
    _ | eof -> errWant "-->" & CommentEnd & dat s
    _ -> hd & comment tl


-- 8.2.4.19 Comment start dash state
commentStartDash S{..} = pos $ case hd of
    '-' -> commentEnd tl
    '>' -> errSeen "<!--->" & CommentEnd & dat tl
    _ | eof -> errWant "-->" & CommentEnd & dat s
    _ -> '-' & hd & comment tl


-- 8.2.4.20 Comment state
comment S{..} = pos $ case hd of
    '-' -> commentEndDash tl
    _ | eof -> errWant "-->" & CommentEnd & dat s
    _ -> hd & comment tl


-- 8.2.4.21 Comment end dash state
commentEndDash S{..} = pos $ case hd of
    '-' -> commentEnd tl
    _ | eof -> errWant "-->" & CommentEnd & dat s
    _ -> '-' & hd & comment tl


-- 8.2.4.22 Comment end state
commentEnd S{..} = pos $ case hd of
    '>' -> CommentEnd & dat tl
    '-' -> errWant "-->" & '-' & commentEnd tl
    _ | white hd -> errSeen "--" & '-' & '-' & hd & commentEndSpace tl
    '!' -> errSeen "!" & commentEndBang tl
    _ | eof -> errWant "-->" & CommentEnd & dat s
    _ -> errSeen "--" & '-' & '-' & hd & comment tl


-- 8.2.4.23 Comment end bang state
commentEndBang S{..} = pos $ case hd of
    '>' -> CommentEnd & dat tl
    '-' -> '-' & '-' & '!' & commentEndDash tl
    _ | eof -> errWant "-->" & CommentEnd & dat s
    _ -> '-' & '-' & '!' & hd & comment tl


-- 8.2.4.24 Comment end space state
commentEndSpace S{..} = pos $ case hd of
    '>' -> CommentEnd & dat tl
    '-' -> commentEndDash tl
    _ | white hd -> hd & commentEndSpace tl
    _ | eof -> errWant "-->" & CommentEnd & dat s
    _ -> hd & comment tl


-- 8.2.4.38 CDATA section state
cdataSection S{..} = pos $ case hd of
    _ | Just s <- next "]]>" -> dat s
    _ | eof -> dat s
    _ | otherwise -> hd & cdataSection tl


-- 8.2.4.39 Tokenizing character references
-- Change from spec: this is reponsible for writing '&' if nothing is to be written
charRef :: Parser -> Bool -> Maybe Char -> S -> [Out]
charRef resume att end S{..} = case hd of
    _ | eof || hd `elem` "\t\n\f <&" || maybe False (== hd) end -> '&' & resume s
    '#' -> charRefNum resume s tl
    _ -> charRefAlpha resume att s

charRefNum resume o S{..} = case hd of
    _ | hd `elem` "xX" -> charRefNum2 resume o True tl
    _ -> charRefNum2 resume o False s

charRefNum2 resume o hex S{..} = case hd of
    _ | hexChar hex hd -> (if hex then EntityHex else EntityNum) & hd & charRefNum3 resume hex tl
    _ -> errSeen "&" & '&' & resume o

charRefNum3 resume hex S{..} = case hd of
    _ | hexChar hex hd -> hd & charRefNum3 resume hex tl
    ';' -> EntityEnd True & resume tl
    _ -> EntityEnd False & errWant ";" & resume s

charRefAlpha resume att S{..} = case hd of
    _ | isAlpha hd -> EntityName & hd & charRefAlpha2 resume att tl
    _ -> errSeen "&" & '&' & resume s

charRefAlpha2 resume att S{..} = case hd of
    _ | alphaChar hd -> hd & charRefAlpha2 resume att tl
    ';' -> EntityEnd True & resume tl
    _ | att -> EntityEnd False & resume s
    _ -> EntityEnd False & errWant ";" & resume s


alphaChar x = isAlphaNum x || x `elem` ":-_"

hexChar False x = isDigit x
hexChar True  x = isDigit x || (x >= 'a' && x <= 'f') || (x >= 'A' && x <= 'F')

-----
-- Text.HTML.TagSoup.Implementation
-----

data Out
    = Char Char
    | Tag             -- <
    | TagShut         -- </
    | AttName
    | AttVal
    | TagEnd          -- >
    | TagEndClose     -- />
    | Comment         -- <!--
    | CommentEnd      -- -->
    | EntityName      -- &
    | EntityNum       -- &#
    | EntityHex       -- &#x
    | EntityEnd Bool  -- Attributed followed by ; for True, missing ; for False
    | Warn String
    -- | Pos Position

errSeen x = Warn $ "Unexpected " ++ show x
errWant x = Warn $ "Expected " ++ show x

data S = S
    {s :: S
    ,tl :: S
    ,hd :: Char
    ,eof :: Bool
    ,next :: String -> Maybe S
    ,pos :: [Out] -> [Out]
    }

infixr &

class Outable a where (&) :: a -> [Out] -> [Out]
instance Outable Char where (&) = ampChar
instance Outable Out where (&) = ampOut
ampChar x y = Char x : y
ampOut x y = x : y

Note that you will need to compile with optimizations enabled:

$ ghc-stage2 -fforce-recomp Bug.hs -O

I have not attempted to minimize this further.

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information
Assignee
Assign to
Time tracking