Optimizing Memory in Haskell, pipes, attoparsec, and containers

Optimizing Memory in Haskell, pipes, attoparsec, and containers - haskell

I'm trying to further optimize my pipes-attoparsec parser and storage, but having trouble getting memory usage any lower.
Given account-parser.hs
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
import Protolude hiding (for)
import Data.Hashable
import Data.IntMap.Strict (IntMap)
import Data.Vector (Vector)
import Pipes
import Pipes.Parse
import Pipes.Safe (MonadSafe, runSafeT)
import qualified Data.Attoparsec.ByteString.Char8 as AB
import qualified Data.IntMap.Strict as IM
import qualified Data.Vector as Vector
import qualified Pipes.Attoparsec as PA
import qualified Pipes.ByteString as PB
import qualified Pipes.Safe.Prelude as PSP
-- accountid|account-name|contractid|code
data AccountLine = AccountLine {
_accountId :: !ByteString,
_accountName :: !ByteString,
_accountContractId :: !ByteString,
_accountCode :: !Word32
} deriving (Show)
type MapCodetoAccountIdIdx = IntMap Int
data Accounts = Accounts {
_accountIds :: !(Vector ByteString),
_cache :: !(IntMap Int),
_accountCodes :: !MapCodetoAccountIdIdx
} deriving (Show)
parseAccountLine :: AB.Parser AccountLine
parseAccountLine = AccountLine <$>
getSubfield <* delim <*>
getSubfield <* delim <*>
getSubfield <* delim <*>
AB.decimal <* AB.endOfLine
where getSubfield = AB.takeTill (== '|')
delim = AB.char '|'
--
aempty :: Accounts
aempty = Accounts Vector.empty IM.empty IM.empty
aappend :: Accounts -> AccountLine -> Accounts
aappend (Accounts ids a2i cps) (AccountLine aid an cid cp) =
case IM.lookup (hash aid) a2i of
Nothing -> Accounts
(Vector.snoc ids (toS aid))
(IM.insert (hash aid) (length ids) a2i)
(IM.insert (fromIntegral cp) (length ids) cps)
Just idx -> Accounts ids a2i (IM.insert (fromIntegral cp) idx cps)
foldAccounts :: (Monad m) => Parser AccountLine m Accounts
foldAccounts = foldAll aappend aempty identity
readByteStringFile :: (MonadSafe m) => FilePath -> Producer' ByteString m ()
readByteStringFile file = PSP.withFile file ReadMode PB.fromHandle
accountLines :: Text -> MonadSafe m => Producer AccountLine m (Either (PA.ParsingError, Producer ByteString m ()) ())
accountLines filename = PA.parsed parseAccountLine (readByteStringFile (toS filename))
main :: IO ()
main = do
[filename] <- getArgs
x <- runSafeT $ runEffect $ Pipes.Parse.evalStateT foldAccounts (accountLines (toS filename))
print $ sizes x
sizes :: Accounts -> (Int, Int, Int)
sizes (Accounts aid xxx acp) = (Vector.length aid, IM.size xxx, IM.size acp)
Compiled with GHC 8.0.2 (stack ghc -- -O2 -rtsopts -threaded -Wall account-parser.hs)
I can't get the memory usage any lower. I have to do fast look ups hence the IntMaps. The file is around 20 MB (and not efficient). Most of the data should be able to fit in 5 MB.
$ ./account-parser /tmp/accounts +RTS -s
(5837,5837,373998)
1,631,040,680 bytes allocated in the heap
221,765,464 bytes copied during GC
41,709,048 bytes maximum residency (13 sample(s))
2,512,560 bytes maximum slop
82 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 2754 colls, 0 par 0.105s 0.142s 0.0001s 0.0002s
Gen 1 13 colls, 0 par 0.066s 0.074s 0.0057s 0.0216s
TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1)
SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
INIT time 0.000s ( 0.001s elapsed)
MUT time 0.324s ( 0.298s elapsed)
GC time 0.171s ( 0.216s elapsed)
EXIT time 0.000s ( 0.005s elapsed)
Total time 0.495s ( 0.520s elapsed)
Alloc rate 5,026,660,297 bytes per MUT second
Productivity 65.5% of total user, 58.4% of total elapsed
gc_alloc_block_sync: 0
whitehole_spin: 0
gen[0].sync: 0
gen[1].sync: 0
And the profile:

If I,
remove the intermediate look up cache
use a HashMap Text (Set Word32)
turn on in place compaction +RTS -c
I can get the total memory down to 34 MB, but my lookups now go to O(n). This is likely the best I'm going to get.
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
import Protolude hiding (for)
import qualified Data.Attoparsec.ByteString.Char8 as AB
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import Data.Set (Set)
import qualified Data.Set as Set
import Pipes
import qualified Pipes.Attoparsec as PA
import qualified Pipes.ByteString as PB
import Pipes.Parse
import Pipes.Safe (MonadSafe, runSafeT)
import qualified Pipes.Safe.Prelude as PSP
-- accountid|account-name|contractid|code
data AccountLine = AccountLine {
_accountId :: !ByteString,
_accountName :: !ByteString,
_accountContractId :: !ByteString,
_accountCode :: !Word32
} deriving (Show)
newtype Accounts = Accounts (HashMap Text (Set Word32))
deriving (Show)
parseAccountLine :: AB.Parser AccountLine
parseAccountLine = AccountLine <$>
getSubfield <* delim <*>
getSubfield <* delim <*>
getSubfield <* delim <*>
AB.decimal <* AB.endOfLine
where getSubfield = AB.takeTill (== '|')
delim = AB.char '|'
--
aempty :: Accounts
aempty = Accounts HashMap.empty
aappend :: Accounts -> AccountLine -> Accounts
aappend (Accounts cps) (AccountLine aid an cid cp) =
case HashMap.lookup (toS aid) cps of
Nothing -> Accounts (HashMap.insert (toS aid) (Set.singleton cp) cps)
Just value -> Accounts (HashMap.update (\codes -> Just (Set.insert cp value)) (toS aid) cps)
foldAccounts :: (Monad m) => Parser AccountLine m Accounts
foldAccounts = foldAll aappend aempty identity
readByteStringFile :: (MonadSafe m) => FilePath -> Producer' ByteString m ()
readByteStringFile file = PSP.withFile file ReadMode PB.fromHandle
accountLines :: Text -> MonadSafe m => Producer AccountLine m (Either (PA.ParsingError, Producer ByteString m ()) ())
accountLines filename = PA.parsed parseAccountLine (readByteStringFile (toS filename))
main :: IO ()
main = do
[filename] <- getArgs
x <- runSafeT $ runEffect $ Pipes.Parse.evalStateT foldAccounts (accountLines (toS filename))
print $ sizes x
-- print x
print $ lookupAccountFromCode x 254741
print $ lookupAccountFromCode x 196939
sizes :: Accounts -> Int
sizes (Accounts acp) = HashMap.size acp
lookupAccountFromCode :: Accounts -> Word32 -> Maybe Text
lookupAccountFromCode (Accounts accts) cp = do
let f a k v = bool a (Just k) (Set.member cp v)
HashMap.foldlWithKey' f Nothing accts
And running
$ ./account-parser /tmp/accounts +RTS -s -c
5837
Just "1-PCECJ5"
Just "AANA-76KOUU"
1,652,177,904 bytes allocated in the heap
83,767,440 bytes copied during GC
17,563,800 bytes maximum residency (18 sample(s))
751,144 bytes maximum slop
34 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 3083 colls, 0 par 0.058s 0.069s 0.0000s 0.0002s
Gen 1 18 colls, 0 par 0.115s 0.151s 0.0084s 0.0317s
TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1)
SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
INIT time 0.000s ( 0.002s elapsed)
MUT time 0.263s ( 0.289s elapsed)
GC time 0.173s ( 0.219s elapsed)
EXIT time 0.009s ( 0.008s elapsed)
Total time 0.445s ( 0.518s elapsed)
Alloc rate 6,286,682,587 bytes per MUT second
Productivity 61.0% of total user, 57.4% of total elapsed
gc_alloc_block_sync: 0
whitehole_spin: 0
gen[0].sync: 0
gen[1].sync: 0

Related

Memoized Collatz sequence

I've posted the same question in CodeReview but failed to get an answer. so I am trying my luck here in SO.
Here is one of my programs that utilized memoization and array to improve performance and memory usage. The performance seems satisfactory but the memory usage is ridiculous and I can't figure out what's wrong:
{-# LANGUAGE BangPatterns #-}
import Data.Functor
import Data.Array (Array)
import qualified Data.Array as Arr
import Control.DeepSeq
genColtzArr n = collatzArr
where collatzArr = Arr.array (1, n) $ take n $ map (\v -> (v, collatz v 0)) [1..]
collatz 1 !acc = 1 + acc
collatz !m !acc
| even m = go (m `div` 2) acc
| otherwise = go (3 * m + 1) acc
where go !l !acc
| l <= n = let !v = collatzArr Arr.! l in 1 + acc + v
| otherwise = collatz l $ 1 + acc
collatz here means this guy. This function is supposed to receive a number n, and then return an array indexing from 1 to n, and in which each cell contains the length of the link from the index to 1 by applying Collatz formula.
But the memory usage of this method is so high. Here is the profiler result (ghc option -prof -fprof-auto -rtsopts, run time option +RTS -p, n == 500000):
total alloc = 730,636,136 bytes (excludes profiling overheads)
COST CENTRE MODULE %time %alloc
genColtzArr.collatz Main 40.4 34.7
genColtzArr.collatz.go Main 25.5 14.4
COST CENTRE MODULE no. entries %time %alloc %time %alloc
genColtzArr Main 105 1 0.0 0.0 74.7 72.1
genColtzArr.collatzArr Main 106 1 8.0 20.8 74.7 72.1
genColtzArr.collatzArr.\ Main 107 500000 0.9 2.2 66.8 51.3
genColtzArr.collatz Main 109 1182582 40.4 34.7 65.9 49.1
genColtzArr.collatz.go Main 110 1182581 25.5 14.4 25.5 14.4
Please note that -O2 is not a desired answer. I want to figure out what's the problem in this program and in general, how should I spot time and memory inefficiencies in Haskell code. Specifically, I have no idea why this code, with tail recursion and bang pattern, can consume so much memory.
UPDATE1:
the same code with -s produces this:
1,347,869,264 bytes allocated in the heap
595,901,528 bytes copied during GC
172,105,056 bytes maximum residency (7 sample(s))
897,704 bytes maximum slop
315 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 2408 colls, 0 par 0.412s 0.427s 0.0002s 0.0075s
Gen 1 7 colls, 0 par 0.440s 0.531s 0.0759s 0.1835s
INIT time 0.000s ( 0.000s elapsed)
MUT time 0.828s ( 0.816s elapsed)
GC time 0.852s ( 0.958s elapsed)
RP time 0.000s ( 0.000s elapsed)
PROF time 0.000s ( 0.000s elapsed)
EXIT time 0.004s ( 0.017s elapsed)
Total time 1.684s ( 1.791s elapsed)
%GC time 50.6% (53.5% elapsed)
Alloc rate 1,627,861,429 bytes per MUT second
Productivity 49.4% of total user, 46.4% of total elapsed
so it takes 300 meg. that is still too large.
Update2
full code
{-# LANGUAGE BangPatterns #-}
import Data.Functor
import Data.Array (Array)
import qualified Data.Array as Arr
import Control.DeepSeq
genColtzArr n = collatzArr
where collatzArr = Arr.array (1, n) $ take n $ map (\v -> (v, collatz v 0)) [1..]
collatz 1 !acc = 1 + acc
collatz !m !acc
| even m = go (m `div` 2) acc
| otherwise = go (3 * m + 1) acc
where go !l !acc
| l <= n = let !v = collatzArr Arr.! l in 1 + acc + v
| otherwise = collatz l $ 1 + acc
genLongestArr n = Arr.array (1, n) llist
where colatz = genColtzArr n
llist = (1, 1):zipWith (\(n1, a1) l2 ->
let l1 = colatz Arr.! a1
in (n1 + 1, if l2 < l1 then a1 else n1 + 1))
llist (tail $ Arr.elems colatz)
main :: IO ()
main = getLine >> do
ns <- map read <$> lines <$> getContents
let m = maximum ns
let lar = genLongestArr m
let iter [] = return ()
iter (h:t) = (putStrLn $ show $ lar Arr.! h) >> iter t
iter ns

As the other answer on CodeReview hints, it's alright for a 500000-element boxed array to comsume ~20MB memory, however it's not only the array but a lot of things all together:
Although you put bang patterns every where, array initialization itself is a lazy foldr:
-- from GHC.Arr
array (l,u) ies
= let n = safeRangeSize (l,u)
in unsafeArray' (l,u) n
[(safeIndex (l,u) n i, e) | (i, e) <- ies]
unsafeArray' :: Ix i => (i,i) -> Int -> [(Int, e)] -> Array i e
unsafeArray' (l,u) n#(I# n#) ies = runST (ST $ \s1# ->
case newArray# n# arrEleBottom s1# of
(# s2#, marr# #) ->
foldr (fill marr#) (done l u n marr#) ies s2#)
So unless you evaluated the last bit of an array, it's holding reference to the list used in initialization. Usually the list can be GC'd on fly while you evaluating the array, but in your case the mutual references and self references disturbed the common GC pattern.
llist is self-referencing to produce every single element, so it will not be GC'd until you evaluated the last element of it
it also holds a reference to genColtzArr so genColtzArr won't be GC'd until llist is fully evaluated
you might think collatz is tail recursive but it's not, it's mutual recursive with collatzArr so again both of them won't be GC'd until fully evaluated
Everything combined, your program will keep three 500000-element list-like structures in memory and results ~80MB peak heap size.
Solution
The obvious solution is to force every array / list to normal form before it's used in another so you won't keep multiple copys of the same data in the memory.
genLongestArr :: Int -> Array Int Int
genLongestArr n =
let collatz = genColtzArr n
-- deepseq genColtzArr before mapping over it
-- this is equivalent to your recursive definition
in collatz `deepseq` (Arr.listArray (1,n) $ fmap fst $ scanl' (maxWith snd) (0, 0) $ Arr.assocs collatz)
maxWith :: Ord a => (b -> a) -> b -> b -> b
maxWith f b b' = case compare (f b) (f b') of
LT -> b'
_ -> b
And in main:
-- deepseq lar before mapping over it
-- this is equivalent to your iter loop
lar `deepseq` mapM_ (print . (lar Arr.!)) ns
Nothing can be done with genColtzArr, it's using itself for memorization so the mutual recursion is kind of necessary.
Now the heap graph peaks at ~20MB as it should:
(Disclaimer: All programs in this answer were compiled with -O0)

Profiling/Improving memory usage and/or GC time

Original
I'm trying to aggregate a CSV file and experiencing [what I consider to be] excessive memory usage and/or GC effort. The issue seems to arise when the number of groups increases. There is no problem when the keys are in the hundreds or thousands, but quickly starts spending a majority of time in the GC when the keys reach tens of thousands.
Update
Moving from Data.ByteString.Lazy.ByteString to Data.ByteString.Short.ShortByteString significantly reduced the memory consumption (to a level I think is reasonable). However, the amount of time spent in the GC still seems far higher than I would expect to be necessary. I moved from Data.HashMap.Strict.HashMap to Data.HashTable.ST.Basic.HashTable to see if the mutation in ST would help but it did not appear to. The following is the current full test code, including generateFile to create a test sample:
{-# LANGUAGE OverloadedStrings #-}
module Main where
import System.IO (withFile, IOMode(WriteMode))
import qualified System.Random as Random
import qualified Data.ByteString.Short as BSS
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Vector as V
import qualified Data.Vector.Mutable as MV
import qualified Control.Monad.ST as ST
import qualified Data.HashTable.ST.Basic as HT
import qualified Data.HashTable.Class as HT (toList)
import Data.Hashable (Hashable, hashWithSalt)
import Data.List (unfoldr)
import qualified Data.Traversable as T
import Control.Monad (forM_)
instance Hashable a => Hashable (V.Vector a) where
hashWithSalt s = hashWithSalt s . V.toList
data CSVFormat = CSVFormat {
csvSeparator :: Char,
csvWrapper :: Char
}
readCSV :: CSVFormat -> Int -> FilePath -> IO [V.Vector BSS.ShortByteString]
readCSV format skip filepath = BL.readFile filepath >>= return . parseCSV format skip
parseCSV :: CSVFormat -> Int -> BL.ByteString -> [V.Vector BSS.ShortByteString]
parseCSV (CSVFormat sep wrp) skp = drop skp . unfoldr (\bs -> if BL.null bs then Nothing else Just (apfst V.fromList (parseLine bs)))
where
{-# INLINE apfst #-}
apfst f (x,y) = (f x,y)
{-# INLINE isCr #-}
isCr c = c == '\r'
{-# INLINE isLf #-}
isLf c = c == '\n'
{-# INLINE isSep #-}
isSep c = c == sep || isLf c || isCr c
{-# INLINE isWrp #-}
isWrp c = c == wrp
{-# INLINE parseLine #-}
parseLine :: BL.ByteString -> ([BSS.ShortByteString], BL.ByteString)
parseLine bs =
let (field,bs') = parseField bs in
case BL.uncons bs' of
Just (c,bs1)
| isLf c -> (field : [],bs1)
| isCr c ->
case BL.uncons bs1 of
Just (c,bs2) | isLf c -> (field : [],bs2)
_ -> (field : [],bs1)
| otherwise -> apfst (field :) (parseLine bs1)
Nothing -> (field : [],BL.empty)
{-# INLINE parseField #-}
parseField :: BL.ByteString -> (BSS.ShortByteString, BL.ByteString)
parseField bs =
case BL.uncons bs of
Just (c,bs')
| isWrp c -> apfst (BSS.toShort . BL.toStrict . BL.concat) (parseEscaped bs')
| otherwise -> apfst (BSS.toShort . BL.toStrict) (BL.break isSep bs)
Nothing -> (BSS.empty,BL.empty)
{-# INLINE parseEscaped #-}
parseEscaped :: BL.ByteString -> ([BL.ByteString], BL.ByteString)
parseEscaped bs =
let (chunk,bs') = BL.break isWrp bs in
case BL.uncons bs' of
Just (_,bs1) ->
case BL.uncons bs1 of
Just (c,bs2)
| isWrp c -> apfst (\xs -> chunk : BL.singleton wrp : xs) (parseEscaped bs2)
| otherwise -> (chunk : [],bs1)
Nothing -> (chunk : [],BL.empty)
Nothing -> error "EOF within quoted string"
aggregate :: [Int]
-> Int
-> [V.Vector BSS.ShortByteString]
-> [V.Vector BSS.ShortByteString]
aggregate groups size records =
let indices = [0..size - 1] in
ST.runST $ do
state <- HT.new
forM_ records (\record -> do
let key = V.fromList (map (\g -> record V.! g) groups)
existing <- HT.lookup state key
case existing of
Just x ->
forM_ indices (\i -> do
current <- MV.read x i
MV.write x i $! const current (record V.! i)
)
Nothing -> do
x <- MV.new size
forM_ indices (\i -> MV.write x i $! record V.! i)
HT.insert state key x
)
HT.toList state >>= T.traverse V.unsafeFreeze . map snd
filedata :: IO ([Int],Int,[V.Vector BSS.ShortByteString])
filedata = do
records <- readCSV (CSVFormat ',' '"') 1 "file.csv"
return ([0,1,2],18,records)
main :: IO ()
main = do
(key,len,records) <- filedata
print (length (aggregate key len records))
generateFile :: IO ()
generateFile = do
withFile "file.csv" WriteMode $ \handle -> do
forM_ [0..650000] $ \_ -> do
x <- BL.pack . show . truncate . (* 15 ) <$> (Random.randomIO :: IO Double)
y <- BL.pack . show . truncate . (* 50 ) <$> (Random.randomIO :: IO Double)
z <- BL.pack . show . truncate . (* 200) <$> (Random.randomIO :: IO Double)
BL.hPut handle (BL.intercalate "," (x:y:z:replicate 15 (BL.replicate 20 ' ')))
BL.hPut handle "\n"
I receive the following profiling result:
17,525,392,208 bytes allocated in the heap
27,394,021,360 bytes copied during GC
285,382,192 bytes maximum residency (129 sample(s))
3,714,296 bytes maximum slop
831 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 577 colls, 0 par 1.576s 1.500s 0.0026s 0.0179s
Gen 1 129 colls, 0 par 25.335s 25.663s 0.1989s 0.2889s
TASKS: 3 (1 bound, 2 peak workers (2 total), using -N1)
SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
INIT time 0.000s ( 0.002s elapsed)
MUT time 11.965s ( 23.939s elapsed)
GC time 15.148s ( 15.400s elapsed)
RP time 0.000s ( 0.000s elapsed)
PROF time 11.762s ( 11.763s elapsed)
EXIT time 0.000s ( 0.088s elapsed)
Total time 38.922s ( 39.429s elapsed)
Alloc rate 1,464,687,582 bytes per MUT second
Productivity 30.9% of total user, 30.5% of total elapsed
gc_alloc_block_sync: 0
whitehole_spin: 0
gen[0].sync: 0
gen[1].sync: 0
And the following heap visualization:

This turned out to be the V.! calls not being strict enough. Replacing them with indexM hugely reduced the memory consumption.

What are possible Haskell optimizations keys?

I found benchmark that solves really simple task in different languages https://github.com/starius/lang-bench . Here 's the code for Haskell :
cmpsum i j k =
if i + j == k then 1 else 0
main = print (sum([cmpsum i j k |
i <- [1..1000], j <- [1..1000], k <- [1..1000]]))
This code runs very slow as you can see in benchmark and I found this very strange.
I tried to inline the function cmpsum and compile with the next flags:
ghc -c -O2 main.hs
but it really didn't help. I am not asking about optimizing the algorithm cause it's the same for all languages, but about possible compiler or code optimizations that can make this code run faster.

Not a complete answer, sorry. Compiling with GHC 7.10 on my machine I get ~12s for your version.
I'd suggest always compiling with -Wall which shows us that our numbers are being defaulted to the infinite precision Integer type. Fixing that:
module Main where
cmpsum :: Int -> Int -> Int -> Int
cmpsum i j k =
if i + j == k then 1 else 0
main :: IO ()
main = print (sum([cmpsum i j k |
i <- [1..1000], j <- [1..1000], k <- [1..1000]]))
This runs in ~5s for me. Running with +RTS -s seems to show we have a loop in constant memory:
87,180 bytes allocated in the heap
1,704 bytes copied during GC
42,580 bytes maximum residency (1 sample(s))
18,860 bytes maximum slop
1 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 0 colls, 0 par 0.000s 0.000s 0.0000s 0.0000s
Gen 1 1 colls, 0 par 0.000s 0.000s 0.0001s 0.0001s
INIT time 0.000s ( 0.001s elapsed)
MUT time 4.920s ( 4.919s elapsed)
GC time 0.000s ( 0.000s elapsed)
EXIT time 0.000s ( 0.000s elapsed)
Total time 4.920s ( 4.921s elapsed)
%GC time 0.0% (0.0% elapsed)
Alloc rate 17,719 bytes per MUT second
Productivity 100.0% of total user, 100.0% of total elapsed
-fllvm shaves off another second or so. Maybe someone else can look into it further.
Edit: Just digging into this a little further. It doesn't look like fusion is happening. Even if I change sum to a foldr (+) 0 which is an explicit "good producer/good consumer" pair.
Rec {
$wgo [InlPrag=[0], Occ=LoopBreaker] :: Int# -> Int#
[GblId, Arity=1, Str=DmdType <S,U>]
$wgo =
\ (w :: Int#) ->
let {
$j :: Int# -> Int#
[LclId, Arity=1, Str=DmdType]
$j =
\ (ww [OS=OneShot] :: Int#) ->
letrec {
$wgo1 [InlPrag=[0], Occ=LoopBreaker] :: [Int] -> Int#
[LclId, Arity=1, Str=DmdType <S,1*U>]
$wgo1 =
\ (w1 :: [Int]) ->
case w1 of _ [Occ=Dead] {
[] -> ww;
: y ys ->
case $wgo1 ys of ww1 { __DEFAULT ->
case lvl of _ [Occ=Dead] {
[] -> ww1;
: y1 ys1 ->
case y of _ [Occ=Dead] { I# y2 ->
case y1 of _ [Occ=Dead] { I# y3 ->
case tagToEnum# # Bool (==# (+# w y2) y3) of _ [Occ=Dead] {
False ->
letrec {
$wgo2 [InlPrag=[0], Occ=LoopBreaker] :: [Int] -> Int#
[LclId, Arity=1, Str=DmdType <S,1*U>]
$wgo2 =
\ (w2 :: [Int]) ->
case w2 of _ [Occ=Dead] {
[] -> ww1;
: y4 ys2 ->
case y4 of _ [Occ=Dead] { I# y5 ->
case tagToEnum# # Bool (==# (+# w y2) y5) of _ [Occ=Dead] {
False -> $wgo2 ys2;
True -> case $wgo2 ys2 of ww2 { __DEFAULT -> +# 1 ww2 }
}
}
}; } in
$wgo2 ys1;
True ->
letrec {
$wgo2 [InlPrag=[0], Occ=LoopBreaker] :: [Int] -> Int#
[LclId, Arity=1, Str=DmdType <S,1*U>]
$wgo2 =
\ (w2 :: [Int]) ->
case w2 of _ [Occ=Dead] {
[] -> ww1;
: y4 ys2 ->
case y4 of _ [Occ=Dead] { I# y5 ->
case tagToEnum# # Bool (==# (+# w y2) y5) of _ [Occ=Dead] {
False -> $wgo2 ys2;
True -> case $wgo2 ys2 of ww2 { __DEFAULT -> +# 1 ww2 }
}
}
}; } in
case $wgo2 ys1 of ww2 { __DEFAULT -> +# 1 ww2 }
}
}
}
}
}
}; } in
$wgo1 lvl } in
case w of wild {
__DEFAULT -> case $wgo (+# wild 1) of ww { __DEFAULT -> $j ww };
1000 -> $j 0
}
end Rec }
In fact, looking at the core for print $ foldr (+) (0:: Int) $ [ i+j | i <- [0..10000], j <- [0..10000]] it seems as though only the first layer of the list comprehension is fused. Is that a bug?

This code gets the job done in 1 second and no extra allocation in GHC 7.10 with -O2 (see the bottom for profiling output):
cmpsum :: Int -> Int -> Int -> Int
cmpsum i j k = fromEnum (i+j==k)
main = print $ sum [cmpsum i j k | i <- [1..1000],
j <- [1..const 1000 i],
k <- [1..const 1000 j]]
In GHC 7.8, you can get almost the same results in this case (1.4 seconds) if you add the following at the beginning:
import Prelude hiding (sum)
sum xs = foldr (\x r a -> a `seq` r (a+x)) id xs 0
There are three issues here:
Specializing the code to Int instead of letting it default to Integer is crucial.
GHC 7.10 offers list fusion for sum that GHC 7.8 does not. This is because the new definition of sum, based on a new definition of foldl, can be very bad in some cases without the "call arity" analysis Joachim Breitner created for GHC 7.10.
GHC performs a limited "full laziness" pass very early in compilation, before any inlining occurs. As a result, the constant [1..1000] terms for j and k, which are used multiple times in the loop, get hoisted out of the loop. This would be good if these were actually expensive to calculate, but in this context it's much cheaper to do the additions over and over and over instead of saving the results. What the code above does is trick GHC. Since const isn't inlined until a little bit later, this first full laziness pass doesn't see that the lists are constant, so it doesn't hoist them out. I wrote it this way because it's nice and short, but it is, admittedly, a little on the fragile side. To make it more robust, use phased inlining:
main = print $ sum [cmpsum i j k | i <- [1..1000],
j <- [1..konst 1000 i],
k <- [1..konst 1000 j]]
{-# INLINE [1] konst #-}
konst = const
This guarantees that konst will be inlined in simplifier phase 1, but no earlier. Phase 1 occurs after list fusion is complete, so it's perfectly safe to let GHC see everything then.
51,472 bytes allocated in the heap
3,408 bytes copied during GC
44,312 bytes maximum residency (1 sample(s))
17,128 bytes maximum slop
1 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 0 colls, 0 par 0.000s 0.000s 0.0000s 0.0000s
Gen 1 1 colls, 0 par 0.000s 0.000s 0.0002s 0.0002s
INIT time 0.000s ( 0.000s elapsed)
MUT time 1.071s ( 1.076s elapsed)
GC time 0.000s ( 0.000s elapsed)
EXIT time 0.000s ( 0.000s elapsed)
Total time 1.073s ( 1.077s elapsed)
%GC time 0.0% (0.0% elapsed)
Alloc rate 48,059 bytes per MUT second
Productivity 99.9% of total user, 99.6% of total elapsed

You are comparing looping over a single statement to counting by generating an intermediate structure (a list) and folding over it. I don't know how great the performance in Java would be if you created a linked list with a billion elements iterated over it.
Here is Haskell code which is (approximately) equivalent to your Java code.
{-# LANGUAGE BangPatterns #-}
main = print (loop3 1 1 1 0)
loop1 :: Int -> Int -> Int -> Int -> Int
loop1 !i !j !k !cc | k <= 1000 = loop1 i j (k+1) (cc + fromEnum (i + j == k))
| otherwise = cc
loop2 :: Int -> Int -> Int -> Int -> Int
loop2 !i !j !k !cc | j <= 1000 = loop2 i (j+1) k (loop1 i j k cc)
| otherwise = cc
loop3 :: Int -> Int -> Int -> Int -> Int
loop3 !i !j !k !cc | i <= 1000 = loop3 (i+1) j k (loop2 i j k cc)
| otherwise = cc
And the execution on my machine (test2 is your Haskell code):
$ ghc --make -O2 test1.hs && ghc --make -O2 test2.hs && javac test3.java
$ time ./test1.exe && time ./test2.exe && time java test3
499500
real 0m1.614s
user 0m0.000s
sys 0m0.000s
499500
real 0m35.922s
user 0m0.000s
sys 0m0.000s
499500
real 0m1.589s
user 0m0.000s
sys 0m0.015s

Abnormally slow Haskell code

I've been trying to practice with the Digits-Recognizer Dojo in Haskell after having done it in F#. I'm getting results, but for some reason my Haskell code is insanely slow, and I cannot seem to find what's wrong.
Here is my code (the .csv files can be found on the Dojo's GitHub):
import Data.Char
import Data.List
import Data.List.Split
import Data.Ord
import System.IO
type Pixels = [Int]
data Digit = Digit { label :: Int, pixels :: Pixels }
distance :: Pixels -> Pixels -> Float
distance d1 d2 = sqrt . sum $ map pointDistance $ zip d1 d2
where pointDistance (a, b) = fromIntegral $ (a - b) * (a - b)
parseDigit :: String -> Digit
parseDigit s = Digit label pixels
where (label:pixels) = map read $ splitOn "," s
identify :: Digit -> [Digit] -> (Digit, Float)
identify digit training = minimumBy (comparing snd) distances
where distances = map fn training
fn ref = (ref, distance (pixels digit) (pixels ref))
readDigits :: String -> IO [Digit]
readDigits filename = do
fileContent <- readFile filename
return $ map parseDigit $ tail $ lines fileContent
main :: IO ()
main = do
trainingSample <- readDigits "trainingsample.csv"
validationSample <- readDigits "validationsample.csv"
let result = [(d, identify d trainingSample) | d <- validationSample]
fmt (d, (ref, dist)) = putStrLn $ "Found..."
mapM_ fmt result
What would be the reason of these bad performances?
[UPDATE] Thank you for your many ideas! I have switched my usage of String to Data.Text and my usage of List to Data.Vector as suggested, unfortunately the result is still far from satisfactory.
My updated code is available here.
To give you a better understanding of my interrogation, here's the output of my Haskell (left) and F# (right) implementation. I'm a total newbie of both languages, so I sincerely believe that there has to be a major mistake in my Haskell version to be that much slower.

If you're patient, you'll notice that the second result is calculated much faster than the first. That's because your implementation takes some time to read in the csv files.
You may be tempted to stick a print statement to see when it's done loading like so:
main = do
trainingSample <- readDigits "trainingsample.csv"
validationSample <- readDigits "validationsample.csv"
putStrLn "done loading data"
But due to lazyIO, this won't do what you think it does. trainingSample and validationSample are not yet fully evaluated. So your print statement will print almost immediately, and the first result will still take forever.
You can force readDigits to fully evaluate their return values, though, which will give you a better idea of how much time is spent there. You could either switch to using non-lazy IO, or just print something derived from the data:
readDigits :: String -> IO [Digit]
readDigits filename = do
fileContent <- readFile filename
putStr' $ filename ++ ": "
rows <- forM (tail $ lines fileContent) $ \line -> do
let xs = parseDigit line
putStr' $ case compare (sum $ pixels xs) 0 of
LT -> "-"
EQ -> "0"
GT -> "+"
return xs
putStrLn ""
return rows
where putStr' s = putStr s >> hFlush stdout
On my machine, this let me see that it took about 27 seconds to fully read the digits from trainingsample.csv.
This is printf-style profiling, which isn't great (much better to use a real profiler, or use criterion to benchmark various parts of your code), but good enough for these purposes.
That's clearly a major part of the slowdown, so it's worth trying to switch to strict io. Using Data.Text.IO.readFile, which is strict, cut it down to ~18 seconds.
UPDATE
Here's how to speed up your updated code:
Use unboxed vectors for Pixels (small win):
import qualified Data.Vector.Unboxed as U
-- ...
type Pixels = U.Vector Int
-- ...
distance :: Pixels -> Pixels -> Float
distance d1 d2 = sqrt . U.sum $ U.zipWith pointDistance d1 d2
where pointDistance a b = fromIntegral $ (a - b) * (a - b)
parseDigit :: T.Text -> Digit
parseDigit s = Digit label (U.fromList pixels)
where (label:pixels) = map toDigit $ T.splitOn (T.pack ",") s
toDigit s = either (\_ -> 0) fst (T.Read.decimal s)
Force the distance evaluation early by using seq (big win):
identify :: Digit -> V.Vector Digit -> (Digit, Float)
identify digit training = V.minimumBy (comparing snd) distances
where distances = V.map fn training
fn ref = let d = distance (pixels digit) (pixels ref) in d `seq` (ref, d)
On my machine, the whole program now runs in ~5s:
% ghc --make -O2 Main.hs
[1 of 1] Compiling Main ( Main.hs, Main.o )
Linking Main ...
% time ./Main
./Main 5.00s user 0.11s system 99% cpu 5.115 total
The thunks were killing you.

Your Vector's version, partially unboxed, adapted for ByteString and compiled with -O2 -fllvm runs in 8 seconds on my machine:
import Data.Ord
import Data.Maybe
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as U
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
type Pixels = U.Vector Int
data Digit = Digit { label :: !Int, pixels :: !Pixels }
distance :: Pixels -> Pixels -> Float
distance d1 d2 = sqrt . U.sum . U.zipWith pointDistance d1 $ d2
where pointDistance a b = fromIntegral $ (a - b) * (a - b)
parseDigit :: B.ByteString -> Digit
parseDigit bs =
let (label:pixels) = toIntegers bs []
in Digit label (U.fromList pixels)
where
toIntegers bs is =
let Just (i,bs') = BC.readInt bs
in if B.null bs' then reverse is else toIntegers (BC.tail bs') (i:is)
identify :: Digit -> V.Vector Digit -> (Digit, Float)
identify digit training = V.minimumBy (comparing snd) distances
where distances = V.map fn training
fn ref = (ref, distance (pixels digit) (pixels ref))
readDigits :: String -> IO (V.Vector Digit)
readDigits filename = do
fileContent <- B.readFile filename
return . V.map parseDigit . V.fromList . tail . BC.lines $ fileContent
main :: IO ()
main = do
trainingSample <- readDigits "trainingsample.csv"
validationSample <- readDigits "validationsample.csv"
let result = V.map (\d -> (d, identify d trainingSample)) validationSample
fmt (d, (ref, dist)) = putStrLn $ "Found " ++ show (label ref) ++ " for " ++ show (label d) ++ " (distance=" ++ show dist ++ ")"
V.mapM_ fmt result
Output of +RTS -s:
989,632,984 bytes allocated in the heap
19,875,368 bytes copied during GC
31,016,504 bytes maximum residency (5 sample(s))
22,748,608 bytes maximum slop
78 MB total memory in use (1 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 1761 colls, 0 par 0.05s 0.05s 0.0000s 0.0008s
Gen 1 5 colls, 0 par 0.00s 0.02s 0.0030s 0.0085s
INIT time 0.00s ( 0.00s elapsed)
MUT time 7.42s ( 7.69s elapsed)
GC time 0.05s ( 0.06s elapsed)
EXIT time 0.00s ( 0.01s elapsed)
Total time 7.47s ( 7.77s elapsed)
%GC time 0.7% (0.8% elapsed)
Alloc rate 133,419,569 bytes per MUT second
Productivity 99.3% of total user, 95.5% of total elapsed

Space leak when grouping key/value pairs in Haskell

I have a problem where my code is creating too many thunks (over 270MB) and consequently spends way too much time (over 70%) in GC when grouping values by key. I was wondering what the best way to group values by key.
The problem is that I have keys and values represented by vectors and I want to group the values by keys preserving the order. For example:
Input:
keys = 1 2 4 3 1 3 4 2 1
vals = 1 2 3 4 5 6 7 8 9
Output:
1 = 1,5,9
2 = 2,8
3 = 4,6
4 = 3,7
Compile options:
ghc --make -03 -fllvm histogram.hs
In imperative programming, I would just use a multimap so I decided to use a hash table and where the associated value is [Int] to store the grouped values. I am hoping there is a much better FP solution.
{-# LANGUAGE BangPatterns #-}
import qualified Data.HashMap.Strict as M
import qualified Data.Vector.Unboxed as V
n :: Int
n = 5000000
kv :: V.Vector (Int,Int)
kv = V.zip k v
where
k = V.generate n (\i -> i `mod` 1000)
v = V.generate n (\i -> i)
ts :: V.Vector (Int,Int) -> M.HashMap Int Int
ts vec =
V.foldl' (\ht (k, v) -> M.insertWith (+) k v ht) M.empty vec
ts2 :: V.Vector (Int,Int) -> M.HashMap Int [Int]
ts2 vec =
V.foldl' (\ht (!k, !v) -> M.insertWith (++) k [v] ht) M.empty vec
main :: IO ()
main = ts2 kv `seq` putStrLn "done"
Here's what spits out at runtime:
3,117,102,992 bytes allocated in the heap
1,847,205,880 bytes copied during GC
324,159,752 bytes maximum residency (12 sample(s))
6,502,224 bytes maximum slop
658 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 5991 colls, 0 par 0.58s 0.58s 0.0001s 0.0003s
Gen 1 12 colls, 0 par 0.69s 0.69s 0.0577s 0.3070s
INIT time 0.00s ( 0.00s elapsed)
MUT time 0.45s ( 0.45s elapsed)
GC time 1.27s ( 1.27s elapsed)
EXIT time 0.03s ( 0.03s elapsed)
Total time 1.75s ( 1.75s elapsed)
%GC time 72.7% (72.8% elapsed)
Alloc rate 6,933,912,935 bytes per MUT second
Productivity 27.3% of total user, 27.3% of total elapsed
You can see it spends a lot of time in GC so I decided to use bangs to make the list concatenation strict. I guess the ++ is quite expensive too but don't know a workaround around this.

Those strictness annotations are useless. They're forcing only the first constructor of the lists.
Even worse, it appears you're attempting to left fold (++), which is never a good idea. It results in lots of useless copying of intermediate lists, even when it's made fully strict.
You should fold to a [Int] -> [Int] value, instead. That will get rid of the multiple useless allocations. I'm on mobile, so I can't really provide full example code. The main idea is that you change the loop to M.insertWith (.) k (v:) and then map ($ [] ) over the values in the HashMap after the fold.

The bulk of your problem is due to (++) leading to "lots of useless copying of intermediate lists", as Carl puts it in his answer. Having played with a few different approaches at replacing (++), I got the best results thus far by switching to Data.IntMap.Strict from containers (just to take advantage of the less stern API - I don't know which implementation is more efficient per se) and using its alter function to prepend the vector elements without creating singleton lists:
import qualified Data.IntMap.Strict as M
import qualified Data.Vector.Unboxed as V
n :: Int
n = 5000000
kv :: V.Vector (Int,Int)
kv = V.zip k v
where
k = V.generate n (\i -> i `mod` 1000)
v = V.generate n (\i -> i)
ts2 :: V.Vector (Int,Int) -> M.IntMap [Int]
ts2 vec =
V.foldl' (\ht (k, v) -> M.alter (prep v) k ht) M.empty vec
where
prep x = Just . maybe [x] (x:)
main :: IO ()
main = print $ M.foldl' (+) 0 $ M.map length $ ts2 kv
The second best solution was using
\ht (k, v) -> M.insertWith (\(x:_) -> (x :)) k [v] ht
as the fold operator. That works with both Data.IntMap.Strict and Data.HashMap.Strict, with similar results performance-wise.
N.B.: Note that in all cases, your original implementation included, the vector elements are being prepended, rather than appended, to the lists. Your problems would be much more serious if you were appending the elements, as repeatedly appending to an empty list with (++) is quadratic in the number of elements.

I tried to run your code on my host and I am not able to reproduce your profile:
runhaskell test8.hs +RTS -sstderr
done
120,112 bytes allocated in the heap
3,520 bytes copied during GC
68,968 bytes maximum residency (1 sample(s))
12,952 bytes maximum slop
1 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 0 colls, 0 par 0.00s 0.00s 0.0000s 0.0000s
Gen 1 1 colls, 0 par 0.00s 0.09s 0.0909s 0.0909s
INIT time 0.00s ( 0.01s elapsed)
MUT time 0.00s ( 29.21s elapsed)
GC time 0.00s ( 0.09s elapsed)
EXIT time 0.00s ( 0.09s elapsed)
Total time 0.01s ( 29.40s elapsed)
%GC time 5.7% (0.3% elapsed)
Alloc rate 381,307,936 bytes per MUT second
Productivity 91.1% of total user, 0.0% of total elapsed
Can you pls outline some more detail about how you are testing the code? If you are using ghci then
$ ghci -fobject-code
we probably need to use -fobject-code to eliminate any space leaks from the ghci. If you have already tried the ghci option, assuming that you are using ghci, I will edit my answer. At this point, I would like to reproduce the issue you are seeing.
Update:
# duplode : Thank you for the pointers. I am going to delete the previous output no one objects to it as it is misleading.
I have been able to reduce the gc overhead by a bit using one of the following options. I am getting some benefits but the overhead is still in the 49 - 50 % range:
ts3 :: V.Vector (Int, Int) -> M.HashMap Int [Int]
ts3 vec =
V.foldl (\ht (!k, !v) ->
let
element = M.lookup k ht in
case element of
Nothing -> M.insert k [v] ht
Just aList -> M.insert k (v:aList) ht) M.empty vec
ts4 :: V.Vector (Int,Int) -> M.HashMap Int [Int]
ts4 vec =
let initMap = V.foldl (\ht (!k,_) -> M.insert k [] ht) M.empty vec
in
V.foldl (\ht (!k, !v) -> M.adjust(\x -> v:x) k ht) initMap vec
The adjust seemed a bit better, but they results seem similar to a straight lookup. With ts4 using adjust:
calling ts4 done.
3,838,059,320 bytes allocated in the heap
2,041,603,344 bytes copied during GC
377,412,728 bytes maximum residency (6 sample(s))
7,725,944 bytes maximum slop
737 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 7260 colls, 0 par 1.32s 1.45s 0.0002s 0.0013s
Gen 1 6 colls, 0 par 0.88s 1.40s 0.2328s 0.9236s
INIT time 0.00s ( 0.00s elapsed)
MUT time 2.18s ( 2.21s elapsed)
GC time 2.19s ( 2.85s elapsed)
RP time 0.00s ( 0.00s elapsed)
PROF time 0.00s ( 0.00s elapsed)
EXIT time 0.01s ( 0.07s elapsed)
Total time 4.38s ( 5.13s elapsed)
%GC time 50.0% (55.5% elapsed)
Alloc rate 1,757,267,879 bytes per MUT second
Productivity 50.0% of total user, 42.7% of total elapsed
Using the simple lookup/update (imperative style of updating a map)
calling ts3 done.
3,677,137,816 bytes allocated in the heap
2,040,053,712 bytes copied during GC
395,867,512 bytes maximum residency (6 sample(s))
7,326,104 bytes maximum slop
769 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 6999 colls, 0 par 1.35s 1.51s 0.0002s 0.0037s
Gen 1 6 colls, 0 par 1.06s 2.16s 0.3601s 1.3175s
INIT time 0.00s ( 0.00s elapsed)
MUT time 1.89s ( 2.07s elapsed)
GC time 2.41s ( 3.67s elapsed)
RP time 0.00s ( 0.00s elapsed)
PROF time 0.00s ( 0.00s elapsed)
EXIT time 0.01s ( 0.08s elapsed)
Total time 4.31s ( 5.82s elapsed)
%GC time 55.9% (63.0% elapsed)
Alloc rate 1,942,816,558 bytes per MUT second
Productivity 44.1% of total user, 32.6% of total elapsed
I am interested in finding out as to how to reduce the time for lookup as show in the profile output below:
COST CENTRE MODULE %time %alloc
ts3.\ Main 54.1 91.4
ts3.\.element Main 19.0 2.9
ts3 Main 11.0 2.9
kv.k Main 6.5 1.4
kv.v Main 5.2 1.4
kv.k.\ Main 4.0 0.0
individual inherited
COST CENTRE MODULE no. entries %time %alloc %time %alloc
MAIN MAIN 72 0 0.0 0.0 100.0 100.0
main Main 158 0 0.0 0.0 0.0 0.0
CAF:main Main 143 0 0.0 0.0 84.2 97.1
main Main 144 1 0.0 0.0 84.2 97.1
ts3 Main 145 1 11.0 2.9 84.2 97.1
ts3.\ Main 156 5000000 54.1 91.4 73.2 94.3
ts3.\.element Main 157 5000000 19.0 2.9 19.0 2.9
CAF:kv Main 142 0 0.0 0.0 0.0 0.0
Code
-- ghc -O2 --make test8.hs -prof -auto-all -caf-all -fforce-recomp +RTS
-- ./test8 +RTS -p
{-# LANGUAGE BangPatterns #-}
import qualified Data.HashMap.Strict as M
import qualified Data.Vector.Unboxed as V
n :: Int
n = 5000000
kv :: V.Vector (Int,Int)
kv = V.zip (k) (v)
where
k = V.generate n (\i -> i `mod` 1000)
v = V.generate n (\i -> i)
ts :: V.Vector (Int,Int) -> M.HashMap Int Int
ts vec =
V.foldl' (\ht (k, v) -> M.insertWith (+) k v ht) M.empty vec
ts2 :: V.Vector (Int,Int) -> M.HashMap Int [Int]
ts2 vec =
V.foldl (\ht (!k, !v) -> M.insertWith (++) k [v] ht) M.empty vec
ts3 :: V.Vector (Int, Int) -> M.HashMap Int [Int]
ts3 vec =
V.foldl (\ht (!k, !v) ->
let
element = M.lookup k ht in
case element of
Nothing -> M.insert k [v] ht
Just aList -> M.insert k (v:aList) ht) M.empty vec
ts4 :: V.Vector (Int,Int) -> M.HashMap Int [Int]
ts4 vec =
let initMap = V.foldl (\ht (!k,_) -> M.insert k [] ht) M.empty vec
in
V.foldl (\ht (!k, !v) -> M.adjust(\x -> v:x) k ht) initMap vec
main :: IO ()
main = ts3 kv `seq` putStrLn "calling ts3 done."
main1 = do
if x == y then
putStrLn "Algos Match"
else
putStrLn "Error"
where
x = ts2 kv
y = ts4 kv

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Optimizing Memory in Haskell, pipes, attoparsec, and containers - haskell

Related

Memoized Collatz sequence

Profiling/Improving memory usage and/or GC time

What are possible Haskell optimizations keys?

Abnormally slow Haskell code

Space leak when grouping key/value pairs in Haskell

Categories

Resources