Use vector to manipulate Chars instead of lists - haskell

I have some code that compiles and works, and some that doesn't.
My concern was that the first version was so bloated that it crashed when run with large arguments, so I wrote a second version with performance in mind.
The second version doesn't even compile. Please advise.
import System.Environment (getArgs)
import Data.List (nub)
import System.Random
import Control.Applicative ( (<$>) )
import Control.Monad (replicateM)
-- | Pick one element of @xs@ at random.
--
-- The index is drawn with 'randomRIO' from the inclusive range
-- @(0, length xs - 1)@ and then looked up with '!!'.
randomItem :: [a] -> IO a
randomItem xs = do
  idx <- randomRIO (0, length xs - 1)
  return (xs !! idx)
-- | Build one random string from a mask: every slot of the mask is a list of
-- candidate characters, and one of them is chosen with 'randomItem'.
genFromMask :: [String] -> IO String
genFromMask slots = mapM randomItem slots
-- | Produce up to @n@ distinct random strings for @mask@.
--
-- @n * 10@ candidates are generated, duplicates are removed with 'nub' and
-- the first @n@ survivors are kept.
genMeSome :: [String] -> Int -> IO [String]
genMeSome mask n =
  (take n . nub) <$> replicateM (n * 10) (genFromMask mask)
-- | Generate @n@ strings for @mask@ and write them to @fi@, one per line.
writeIt :: FilePath -> Int -> [String] -> IO ()
writeIt fi n mask = genMeSome mask n >>= writeFile fi . unlines
-- | Upper-case letters @A@ to @Z@.
maj :: String
maj = enumFromTo 'A' 'Z'

-- | Decimal digits @0@ to @9@.
numa :: String
numa = enumFromTo '0' '9'

-- | Mask selected by the @"bra"@ argument of 'main'.
-- Some regions do not use any of the free ranges.
genBra :: [String]
genBra = ["VWXYZ", maj, maj, " "] ++ replicate 4 numa

-- | Mask selected by the @"aus"@ argument of 'main'.
genAus :: [String]
genAus = replicate 3 maj ++ [" "] ++ replicate 3 numa
-- | Entry point: @genLicensePlate country file number@.
--
-- The first argument selects the mask, the second is the output file and the
-- third is parsed with 'read' as the number of strings to write.
main :: IO ()
main = getArgs >>= run
  where
    -- Supported country codes and their masks.
    masks = [("aus", genAus), ("bra", genBra)]

    run (mo : fi : n : _) =
      maybe (error "country is not supported") (writeIt fi (read n)) (lookup mo masks)
    run _ = error "wrong input, format is: genLicensePlate country file number"
And here is the second:
import System.Environment (getArgs)
import System.Random
import Crypto.Random.AESCtr (makeSystem)
import Control.Applicative ( (<$>) )
import qualified Data.Vector as V
import qualified Data.Text as T
import qualified Data.Text.IO as T
-- | Remove duplicate elements from a vector.
-- An element is dropped when it occurs again later in the vector, so the
-- last occurrence of every value is the one that is kept.
-- NOTE(review): the guard compares elements with '==' but the signature has
-- no @Eq a@ constraint; this is the "No instance for (Eq a)" error quoted
-- further down.
nubV :: V.Vector a -> V.Vector a
nubV va
| V.null va = V.empty
| V.any (== headV) tailV = nubV tailV
| otherwise = headV `V.cons` nubV tailV
where
headV = V.head va
tailV = V.tail va
-- | Pick a random element of @xs@ and return it together with the generator
-- produced by the 'randomR' call that chose the index.
randomItem :: RandomGen g => g -> V.Vector a -> (a,g)
randomItem g xs = (xs V.! idx, g')
  where
    (idx, g') = randomR (0, V.length xs - 1) g
-- | Build one random vector from a mask: one element is picked from every
-- slot of @xs@, threading the generator from slot to slot.
genFromMask :: RandomGen g => g -> V.Vector (V.Vector a) -> V.Vector a
genFromMask g xs
  | V.null xs = V.empty
  | otherwise = V.cons picked (genFromMask g' (V.tail xs))
  where
    (picked, g') = randomItem g (V.head xs)
-- | Keep the first @n@ elements of 'nubV' applied to the vector built by
-- @V.replicateM (n*10) (genFromMask g mask)@.
-- NOTE(review): the same generator @g@ is handed to 'genFromMask' here —
-- confirm that this produces the intended variety of samples.
genMeSome :: RandomGen g => g -> V.Vector (V.Vector a) -> Int -> V.Vector (V.Vector a)
genMeSome g mask n =
V.take n $ nubV $ V.replicateM (n*10) (genFromMask g mask)
-- | Write the vectors produced by 'genMeSome' to the file @fi@ as lines of text.
-- NOTE(review): 'T.pack' is applied to the whole list of lists here, which is
-- the "Couldn't match expected type `Char'" error quoted further down.
writeIt :: RandomGen g => g -> FilePath -> Int -> V.Vector (V.Vector a) -> IO ()
writeIt g fi n mask =
T.writeFile fi $ T.unlines $ T.pack $ V.toList (V.map V.toList $ genMeSome g mask n)
-- | Upper-case letters @A@ to @Z@.
maj :: V.Vector Char
maj = V.fromList ['A'..'Z']

-- | Decimal digits @0@ to @9@.
-- This was written @num a = ...@, which defined a one-argument function
-- @num@ and left the name @numa@ used by the masks below undefined.
numa :: V.Vector Char
numa = V.fromList ['0'..'9']

-- | Single-character slots holding a space and a dash.
vspa, vtir :: V.Vector Char
vspa = V.fromList " "
vtir = V.fromList "-"

-- | Mask selected by the @"bra"@ argument of 'main'.
-- Some regions do not use any of the free ranges.
genBra :: V.Vector (V.Vector Char)
genBra = V.fromList [static,maj,maj,vspa,numa,numa,numa,numa]
  where
    static = V.fromList "VWXYZ"

-- | Mask selected by the @"aus"@ argument of 'main'.
genAus :: V.Vector (V.Vector Char)
genAus = V.fromList [maj,maj,maj,vspa,numa,numa,numa]
-- | Entry point: @genLicensePlate country file number@, using a generator
-- obtained from 'makeSystem'.
main :: IO ()
main = do
  g <- makeSystem
  getArgs >>= run g
  where
    -- Supported country codes and their masks.
    masks = [("aus", genAus), ("bra", genBra)]

    run g (mo : fi : n : _) =
      maybe (error "country is not supported") (writeIt g fi (read n)) (lookup mo masks)
    run _ _ = error "wrong input, format is: genLicensePlate country file number"
I am trying to generate fake license plates to populate an anonymous database.
EDIT1:
Here are the errors:
genLicensePlate.hs:22:12:
No instance for (Eq a)
arising from a use of `=='
In the first argument of `V.any', namely `(== headV)
In the expression: V.any (== headV) tailV
In a stmt of a pattern guard for
an equation for `nubV':
V.any (== headV) tailV
genLicensePlate.hs:48:52:
Couldn't match expected type `Char' with actual type
Expected type: V.Vector Char
Actual type: V.Vector [a]
In the first argument of `V.toList', namely
`(V.map V.toList $ genMeSome g mask n)'
In the second argument of `($)', namely
`V.toList (V.map V.toList $ genMeSome g mask n)'
EDIT2:
So the general idea is to use a mask to generate random Strings.
Like myFunc g [['A'..'Z'],['A'..'Z']] gives AA or ZZ or BA or FG etc...
Then I use this function to generate many such strings from the mask.
After that I remove duplicates and take as many as needed (since I generate 10 times the number requested, I still have enough even after removing duplicates).
Finally I write the result to a file.
I hope it is more clear.
Kind regards,
Sar

nubV needs an Eq constraint, since it compares elements (but you really should use a Set or HashSet or so to get a better algorithm)
-- | Remove duplicate elements from a vector.
-- An element is dropped when it occurs again later in the vector, so the
-- last occurrence of every value is the one that is kept.
nubV :: Eq a => V.Vector a -> V.Vector a
nubV va =
  if V.null va
    then V.empty
    else
      let headV = V.head va
          tailV = V.tail va
          rest = nubV tailV
       in if V.any (== headV) tailV then rest else V.cons headV rest
And in writeIt, you lack a map,
-- | Write the plates produced by 'genMeSome' to the file @fi@, one per line.
--
-- The element type is fixed to 'Char': 'T.pack' only accepts a 'String', so a
-- signature polymorphic in the element type cannot type-check.
writeIt :: RandomGen g => g -> FilePath -> Int -> V.Vector (V.Vector Char) -> IO ()
writeIt g fi n mask =
  -- Each generated plate is packed on its own; this @map@ is what the
  -- question's version lacked.
  T.writeFile fi . T.unlines . map T.pack . V.toList . V.map V.toList $ genMeSome g mask n
since you get a list of lists of Char from V.toList (V.map V.toList $ genMeSome g mask n).
That fixes the two reported errors.

Related

How to parametrize a constant (in this particular recursive function)?

test1 correctly produces the following structure from the string "abcdef":
(a,(1,[0])) -- type 'a' occur 1 time in position 0
(b,(1,[1])) -- type 'b' occur 1 time in position 1
(c,(1,[2]))
(d,(1,[3]))
(e,(1,[4]))
(f*,(1,[5])) -- type 'f' is the last of the list
But this result depends on the number 6, that is the length of a very particular class of string, invalid for general case.
So if the string in test1 is instead "abc" the result is wrong:
(a,(1,[0]))
(b,(1,[7]))
(c*,(1,[8]))
If the string in test1 is instead "abcdefgh" the result is also wrong:
(a,(1,[0]))
(b,(1,[2])) -- Should be [1]
(c,(1,[3])) -- Should be [2]
(d,(1,[4])) -- ...
(e,(1,[5]))
(f,(1,[6]))
(g,(1,[7]))
(h*,(1,[8]))
In addTrieWithCounter I am not able to substitute this constant (6) with a function parameterized on the length of the word.
The CONTEXT of this function: addTrieWithCounter will be placed in a special "loop", so that "al alts" becomes: addTrieWithCounter ... "al" 0 -> "drop the space" -> addTrieWithCounter ... "alts" 3. This way the occurrences stay aligned with the positions in the initial string.
-- analyzing "all alts" should be obtained this result.
(a,(2,[4,0])) -- type 'a' occurs 2 times, in positions 4 and 0 (reversed order)
(l,(2,[5,1])) -- type 'l' (of seq "al") occurs 2 times, in positions 5 and 1 (reversed order)
(l*,(1,[2])) -- type 'l' (of seq "all") occur 1 time in positions 2
(t,(1,[6])) -- type 't' (of seq "alt") occur 1 time in positions 6
(s*,(1,[7])) -- type 's' (of seq "alts") occur 1 time in positions 7
It will be a trivial thing, but I have no idea.
Thanks in advance for your suggestions.
import qualified Data.Map as M
import Text.PrettyPrint as TP
import Data.Either (either)
data Trie a b = Nil | Trie (M.Map (Either a a) (b, Trie a b)) deriving Show
-- (Just a note: Trie will be a Monoid's instance. So with "Either" it is possible to distinguish the following cases: "all" and "alliance")
-- | Add a sequence to a trie while threading a position counter.
--
-- The first argument pairs the trie with the current counter @st@; @f@
-- combines a node's @(Int, [t1])@ payload with a counter value. The result is
-- the updated trie together with a counter value.
--
-- * An empty sequence returns the pair unchanged.
-- * 'Nil' is replaced by an empty 'Trie' before inserting.
-- * The last element of the sequence is stored under a 'Left' key, every
--   other element under a 'Right' key.
--
-- NOTE(review): the 'Nothing' branch of the final clause starts the recursive
-- call with the literal @6 - length xs@, which is the hard-coded length the
-- question is about.
addTrieWithCounter
:: Ord a =>
(Trie a (Int, [t1]), Int)
-> ((Int, [t1]) -> Int -> (Int, [t1]))
-> [a]
-> (Trie a (Int, [t1]), Int)
addTrieWithCounter (t,st) f [] = (t,st)
addTrieWithCounter (Nil,st) f xs = addTrieWithCounter (Trie M.empty, st) f xs
addTrieWithCounter (Trie m,st) f [x] =
(Trie $ M.insertWith (\(c,_) _ -> (f c st,Nil)) (Left x) (f (0,[]) st,Nil) m,st + 1)
addTrieWithCounter (Trie m, st) f (x:xs) =
case M.lookup (Right x) m of -- !!!!! PROBLEM IN THE FOLLOWING LINE !!!!!
Nothing -> let (t',st') = addTrieWithCounter (Nil, 6 - length xs ) f xs
in (Trie $ M.insert (Right x) (f (0,[]) st,t') m,st + 1)
Just (c,t) -> let (t',st') = addTrieWithCounter (t,st) f xs -- TO CHANGE
in (Trie $ M.insert (Right x) (f c st',t') m,st')
-- | Render the trie half of a @(trie, counter)@ pair with 'showTrie'.
showTrieS f = showTrie f . fst
-- | Pretty-print a trie: one @(key,payload)@ line per entry, with the
-- entry's sub-trie nested four columns below it. @f@ renders a key.
showTrie :: Show a => (Either t t -> String) -> Trie t a -> Doc
showTrie _ Nil = empty
showTrie f (Trie m)
  | M.null m = empty
  | otherwise = vcat (map entry (M.assocs m))
  where
    entry (k, (count, t)) =
      vcat
        [ lparen TP.<> text (f k) TP.<> comma TP.<> text (show count) TP.<> rparen
        , nest 4 (showTrie f t)
        ]
test1 = showTrieS f1 t
where
f1 = (either (:"*") (:""))
t = addTrieWithCounter (Trie M.empty,0) f2 "abcdef"
f2 (cr,poss) st = ((cr + 1),(st : poss))
This will get you most of the way there. It doesn't solve your
exact problem, but shows how to remove the hard-coded length value.
import qualified Data.Map.Strict as M
import qualified Data.IntSet as S
import Data.Monoid
import Text.PrettyPrint hiding ((<>))
data GenTrie a b = Trie (M.Map a (b, GenTrie a b))
deriving (Show)
emptyTrie = Trie M.empty
data Info = Info { _count :: Int, _positions :: S.IntSet }
deriving (Show)
type Trie = GenTrie Char Info
-- | Insert a string into the trie. @i@ is the position recorded for the first
-- character; every following character is recorded at the next position.
-- Each node visited gets its count incremented and the position added.
addString :: Int -> String -> Trie -> Trie
addString i cs t = descend t i cs
  where
    descend :: Trie -> Int -> String -> Trie
    descend node _ [] = node
    descend node pos (c : rest) = Trie (M.insert c entry children)
      where
        Trie children = node
        next = pos + 1
        entry = case M.lookup c children of
          -- First time this character is seen at this level.
          Nothing -> (Info 1 (S.singleton pos), descend emptyTrie next rest)
          -- Character already present: bump its count and record the position.
          Just (info, sub) ->
            ( info { _count = _count info + 1
                   , _positions = S.insert pos (_positions info)
                   }
            , descend sub next rest
            )
printTrie = putStrLn . showTrie
showTrie = render . trieToDoc
-- | Render a trie as a document: one line per entry showing the character,
-- its count and its positions, with the sub-trie nested four columns below.
trieToDoc :: Trie -> Doc
trieToDoc (Trie m)
  | M.null m = empty
  | otherwise = vcat (map entry (M.assocs m))
  where
    entry (ch, (info, t)) =
      vcat
        [ text [ch] <> space <> text (show (_count info)) <> space
            <> text (show (S.toList (_positions info)))
        , nest 4 (trieToDoc t)
        ]
test1 = printTrie $ addString 0 "abc" emptyTrie
test2 = printTrie $ addString 4 "alts" $ addString 0 "all" emptyTrie
-- Replacement for the final clause of the question's addTrieWithCounter.
-- Both branches pass @st + 1@ to the recursive call, apply @f@ with the
-- current @st@, and return the counter @st'@ produced by the recursion, so no
-- hard-coded length is needed.
addTrieWithCounter (Trie m,st) f (x:xs) =
case M.lookup (Right x) m of
Nothing -> let (t',st') = addTrieWithCounter (Nil, st + 1 ) f xs
in (Trie $ M.insert (Right x) (f (0,[]) st,t') m, st')
Just (c,t) -> let (t',st') = addTrieWithCounter (t,st + 1) f xs
in (Trie $ M.insert (Right x) (f c st,t') m,st')

Representing Types And Occurrences: (so) easy to understand, (so) difficult to code

A brief introduction to the types and occurrences through examples.
Ex1. abbacb
a, b, c are the types.
a occurs 2 times; b occurs 3 times; c occurs 1 time.
This can be represented more concisely as [('a',2),('b',3),('c',1)] (Indeed, the order doesn't matter).
Ex2. abbacb
ab, bb, ba, ac, cb are sequences of types
Each sequence occurs only once.
This can be represented as [("ab",1),("bb",1),("ba",1),("ac",1),("cb",1)]
The following graphical structure has the same informative content of the previous two:
('a',2) -- 'a' occurs 2 times
    ('b',1) -- "ab" occurs 1 time
    ('c',1) -- "ac" occurs 1 time
('b',2) -- 'b' occurs 2 times
    ('a',1) -- "ba" occurs 1 time
    ('b',1) -- "bb" occurs 1 time
('c',1) -- 'c' occurs 1 time
    ('b',1) -- "cb" occurs 1 time
In Haskell: [(('a',2),[('b',1),('c',1)]),(('b',2),[('a',1),('b',1)]),(('c',1),[('b',1)])]
For occurrences of sequences of 3 elements:
('a',2) -- 'a' occurs 2 times
('b',1) -- "ab" occurs 1 times
('b',1) -- "abb" occurs 1 times
('c',1) -- "ac" occurs 1 times
('b',1) -- "acb" occurs 1 times
...
In Haskell:
[
(('a',2), [(('b',1),[('b',1)]),(('c',1),[('b',1)])]),
(('b',2), [(('a',1),[('c',1)]),(('b',1),[('a',1)])])
]
with type [((Char, Int), [((Char, Int), [(Char, Int)])])]
Even considering only the sequences of two and three elements, the comprehensibility of the graphical representation is much greater than that in Haskell.
In addition, lists are not very efficient, so I used the Data.Map library and consequently a slightly different representation.
The following examples are based on Pi's digits. Interesting results can be obtained using the words of a novel.
My questions are:
The functions dedicated to sequences of three types are very complicated. Is it possible to drastically simplify them?
I cannot even imagine how to generalize the functions to sequences of arbitrary length. Does anyone have an idea of how it could be done?
Using the following data type recursion should be easier to implement:
data TuplesTypesOccurences a = L (M.Map a Int) | B (M.Map a (Int,TuplesTypesOccurences a))
With this type, however, wouldn't I lose access to all of the functions in the Data.Map library?
import qualified Data.Map as M
import Data.List (sortBy)
piDigits = "31415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756"
-- | Number of occurrences of each type.
type TypesOccurrences a = M.Map a Int

-- | Add one occurrence to the table @mp@ for every element of the list.
toTypeOccurrences :: Ord k => [k] -> TypesOccurrences k -> TypesOccurrences k
toTypeOccurrences xs mp =
  M.unionWith (+) mp (M.fromListWith (+) [(x, 1) | x <- xs])
-- ex. toTypeOccurrences piDigits M.empty
-- | Print one @key count@ line per entry, highest count first.
pprintTO :: Show a => TypesOccurrences a -> IO ()
pprintTO table = mapM_ (putStrLn . line) byCountDesc
  where
    byCountDesc = sortBy (\x y -> compare (snd y) (snd x)) (M.toList table)
    line (xs, n) = show xs ++ " " ++ show n
-- ex. pprintTO . M.filter (> 22) . toTypeOccurrences piDigits $ M.empty
-- | For each type: how often it starts a pair, and the counts of the types
-- that follow it.
type Seq2TypeOccurrences a = M.Map a (Int,TypesOccurrences a)

-- | Record every adjacent pair of the list in the table @mp@.
-- Lists with fewer than two elements leave the table unchanged.
toSQ2TO :: Ord a => [a] -> Seq2TypeOccurrences a -> Seq2TypeOccurrences a
toSQ2TO (x : rest@(y : _)) mp =
  toSQ2TO rest (M.insertWith bump x (1, M.singleton y 1) mp)
  where
    -- @x@ already known: one more pair starting with it, one more @y@ after it.
    bump _ (n, followers) = (n + 1, M.insertWith (+) y 1 followers)
toSQ2TO _ mp = mp
-- ex. toSQ2TO piDigits M.empty
-- | Print every entry as @(key,count)@ followed by the entries of its inner
-- map, each on its own tab-indented line. The @drop 2@ removes the leading
-- newline and tab of the first inner entry, since the header already ends
-- with them.
pprintSQ2TO :: Show a => Seq2TypeOccurrences a -> IO ()
pprintSQ2TO = mapM_ putStrLn . map (\(x,(n,mp)) -> "(" ++ (show x) ++ "," ++ (show n) ++ ")\n\t" ++ (drop 2 . concatMap (("\n\t" ++) . show) . M.toList $ mp)) . M.toList
-- ex. pprintSQ2TO (toSQ2TO piDigits M.empty)
-- | Keep only entries whose count exceeds @n@, then only followers whose
-- count exceeds @n@, and finally drop entries left without followers.
greaterThanSQ2TO :: Ord a => Int -> Seq2TypeOccurrences a -> Seq2TypeOccurrences a
greaterThanSQ2TO n table =
  M.filter hasFollowers (M.map pruneFollowers (M.filter frequent table))
  where
    frequent (m, _) = m > n
    pruneFollowers (o, mp2) = (o, M.filter (> n) mp2)
    hasFollowers (_, mp2) = not (M.null mp2)
-- ex. pprintSQ2TO . greaterThanSQ2TO 4 . toSQ2TO piDigits $ M.empty
-- | Flatten the table into @([x,y], count)@ pairs, sorted by descending count.
descSortSQ2TO :: Ord a => Seq2TypeOccurrences a -> [([a], Int)]
descSortSQ2TO table = sortBy (\xs ys -> compare (snd ys) (snd xs)) pairs
  where
    pairs =
      [ ([x, y], n)
      | (x, (_, mp2)) <- M.toList table
      , (y, n) <- M.toList mp2
      ]
-- mapM_ print . descSortSQ2TO . greaterThanSQ2TO 4 . toSQ2TO piDigits $ M.empty
-- | Merge two tables, adding the counts of keys present in both at every level.
unionSQ2TO :: Ord a => Seq2TypeOccurrences a -> Seq2TypeOccurrences a -> Seq2TypeOccurrences a
unionSQ2TO = M.unionWith merge
  where
    merge (n1, mp1) (n2, mp2) = (n1 + n2, M.unionWith (+) mp1 mp2)
-- | Three-level table: first type, then second type, then counts of the third.
type Seq3TypeOccurrences a = M.Map a (Int,Seq2TypeOccurrences a)

-- | Record every run of three adjacent elements of the list in the table @mp@.
-- Lists with fewer than three elements leave the table unchanged.
toSQ3TO :: Ord k => [k] -> Seq3TypeOccurrences k -> Seq3TypeOccurrences k
toSQ3TO (x : rest@(y : z : _)) mp =
  toSQ3TO rest (M.insertWith bumpFirst x (1, freshSecond) mp)
  where
    -- Entry used when @y@ has not yet been seen after @x@.
    freshSecond = M.singleton y (1, M.singleton z 1)
    bumpFirst _ (n, mp2) = (n + 1, M.insertWith bumpSecond y (1, M.singleton z 1) mp2)
    bumpSecond _ (m, mp3) = (m + 1, M.insertWith (+) z 1 mp3)
toSQ3TO _ mp = mp
-- ex. toSQ3TO piDigits M.empty
-- | Print the three-level table as an indented outline: each first-level
-- entry as @(key,count)@, its second-level entries on lines indented by one
-- tab, and their third-level entries (rendered by @f@) indented by two tabs.
pprint3 :: Show a => Seq3TypeOccurrences a -> IO ()
pprint3 = mapM_ putStrLn . map (\(x,(n,mp)) -> "(" ++ (show x) ++ "," ++ (show n) ++ ")" ++ (concatMap (\(x2,(n2,mp2)) -> "\n\t(" ++ (show x2) ++ "," ++ (show n2) ++ ")" ++ (f mp2)) . M.toList $ mp)) . M.toList
where
f = concatMap (\(x,n) -> "\n\t\t(" ++ (show x) ++ "," ++ (show n) ++ ")") . M.toList
-- pprint3 . toSQ3TO piDigits $ M.empty
-- | Print every three-element sequence as a list followed by its count,
-- one sequence per line, in key order.
pprint3B :: Show a => Seq3TypeOccurrences a -> IO ()
pprint3B table =
  mapM_
    putStrLn
    [ show [x, y, z] ++ " " ++ show n
    | (x, (_, mp)) <- M.toList table
    , (y, (_, mp2)) <- M.toList mp
    , (z, n) <- M.toList mp2
    ]
-- pprint3B . toSQ3TO piDigits $ M.empty
-- | Prune the three-level table to counts greater than @n@.
-- The pipeline runs bottom-up (right to left): keep first-level entries with
-- count above @n@; keep second-level entries with count above @n@; drop
-- first-level entries left empty; keep third-level counts above @n@; drop
-- second-level entries left empty; drop first-level entries left empty.
greaterThan3Q2TO :: Ord a => Int -> Seq3TypeOccurrences a -> Seq3TypeOccurrences a
greaterThan3Q2TO n = M.filter (\(_,mp) -> not . M.null $ mp)
. M.map (\(m,mp) -> (m,M.filter (\(o,mp2) -> not . M.null $ mp2) mp))
. M.map (\(m,mp) -> (m,M.map (\(o,mp2) -> (o,M.filter (>n) mp2)) mp))
. M.filter (\(_,mp) -> not. M.null $ mp)
. M.map (\(m,mp) -> (m,M.filter ((n <) . fst) mp))
. M.filter (\(m,mp) -> m > n)
-- ex. pprint3B . greaterThan3Q2TO 2 . toSQ3TO piDigits $ M.empty
-- | Merge two three-level tables, adding first-level counts and merging the
-- nested tables with 'unionSQ2TO'.
unionSQ3TO :: Ord a => Seq3TypeOccurrences a -> Seq3TypeOccurrences a -> Seq3TypeOccurrences a
unionSQ3TO = M.unionWith merge
  where
    merge (n, mp2a) (m, mp2b) = (n + m, unionSQ2TO mp2a mp2b)
You need to define a recursive data structure like this:
data Trie = Nil | Trie (Map Char (Int, Trie))
This allows the show and add functions to be defined recursively.
Here is an implementation. Run test3 to see an example of how it works.
import qualified Data.Map as M
import Text.PrettyPrint
import Data.List
data Trie = Nil | Trie (M.Map Char (Int, Trie))
-- | Render a trie: one @("prefix",count)@ line per entry, where the prefix is
-- the path from the root, with the sub-trie nested four columns below.
showTrie :: String -> Trie -> Doc
showTrie _ Nil = empty
showTrie prefix (Trie m) = vcat (map entry (M.assocs m))
  where
    entry (k, (count, t)) =
      let prefix' = prefix ++ [k]
       in vcat
            [ lparen <> char '"' <> text prefix' <> char '"' <> comma <> int count <> rparen
            , nest 4 (showTrie prefix' t)
            ]
-- add an element to a Trie
-- | Add a string to a trie, incrementing the count of every node on its path.
addTrie :: Trie -> String -> Trie
addTrie t [] = t
addTrie Nil xs = addTrie (Trie M.empty) xs
addTrie (Trie m) (x:xs) = Trie (M.insert x (count', addTrie child xs) m)
  where
    -- A new character starts at count 1 with no sub-trie.
    (count', child) = maybe (1, Nil) (\(c, t) -> (c + 1, t)) (M.lookup x m)
test1 =
let t1 = addTrie Nil "abcd"
t2 = addTrie t1 "abce"
in putStrLn $ render $ showTrie "" t2
test2 n str =
putStrLn $ render $ showTrie "" $
foldr (flip addTrie) Nil (map (take n) (tails str))
test3 = test2 4 "31415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756"

Mutually recursive IO definitions

I can write the following:
-- | Prepend 0 to the list of successors of the input elements.
f :: [Int] -> [Int]
f xs = 0 : [n + 1 | n <- xs]
-- | Double every element of the list.
g :: [Int] -> [Int]
g xs = [n * 2 | n <- xs]
a = f b
b = g a
main = print $ take 5 a
And things work perfectly fine (ideone).
However, lets say I want g to do something more complex than multiply by 2, like ask the user for a number and add that, like so:
-- | For every element, read a number from standard input and add it to the
-- element.
g2 :: [Int] -> IO [Int]
g2 = mapM addInput
  where
    addInput x = fmap ((+ x) . read) getLine
How do I then, well, tie the knot?
Clarification:
Basically I want the list of Ints from f to be the input of g2 and the list of Ints from g2 to be the input of f.
The effectful generalization of lists is ListT:
import Control.Monad
import Pipes
f :: ListT IO Int -> ListT IO Int
f x = return 0 `mplus` fmap (+ 1) x
-- | For every element produced by @x@, read a number from standard input and
-- yield that number plus the element.
g2 :: ListT IO Int -> ListT IO Int
g2 x = x >>= \n -> lift (fmap ((+ n) . read) getLine)
a = f b
b = g2 a
main = runListT $ do
n <- a
lift (print n)
mzero
You can also implement take like functionality with a little extra code:
import qualified Pipes.Prelude as Pipes
take' :: Monad m => Int -> ListT m a -> ListT m a
take' n l = Select (enumerate l >-> Pipes.take n)
main = runListT $ do
n <- take' 5 a
lift (print n)
mzero
Example session:
>>> main
0
1<Enter>
2
2<Enter>
3<Enter>
7
4<Enter>
5<Enter>
6<Enter>
18
7<Enter>
8<Enter>
9<Enter>
10<Enter>
38
You can learn more about ListT by reading the pipes tutorial, specifically the section on ListT.

Generating sequence from Markov chain in Haskell

I would like to generate random sequences from a Markov chain. To generate the Markov chain I use the following code.
module Main where
import qualified Control.Monad.Random as R
import qualified Data.List as L
import qualified Data.Map as M
-- | Number of times each @(from, to)@ transition was observed.
type TransitionMap = M.Map (String, String) Int

-- | For each state, its outgoing transitions with their counts.
type MarkovChain = M.Map String [(String, Int)]

-- | Count one more occurrence of the transition @k@.
addTransition :: (String, String) -> TransitionMap -> TransitionMap
addTransition k = M.alter (Just . maybe 1 (1 +)) k
-- | Group the transition counts by source state.
fromTransitionMap :: TransitionMap -> MarkovChain
fromTransitionMap m = M.fromList [(k, frequencies k) | k <- ks]
  where
    -- Distinct source states, in key order.
    ks = L.nub [from | (from, _) <- M.keys m]
    -- Outgoing transitions of state @a@ with their weights.
    frequencies a = [(to, w) | ((from, to), w) <- M.toList m, from == a]
After collecting the statistics and generating a Markov Chain object I would like to generate random sequences. I could imagine this method could look something like that (pseudo-code)
generateSequence mc s
| s == "." = s
| otherwise = s ++ " " ++ generateSequence mc s'
where s' = drawRandomlyFrom $ R.fromList $ mc ! s
I would greatly appreciate if someone could explain to me, how I should implement this function.
Edit
If anyone's interested it wasn't as difficult as I thought.
module Main where
import qualified Control.Monad.Random as R
import qualified Data.List as L
import qualified Data.Map as M
type TransitionMap = M.Map (String, String) Rational
type MarkovChain = M.Map String [(String, Rational)]
addTransition :: TransitionMap -> (String, String) -> TransitionMap
addTransition m k = M.insertWith (+) k 1 m
fromTransitionMap :: TransitionMap -> MarkovChain
fromTransitionMap m =
M.fromList [(k, frequencies k) | k <- ks]
where ks = L.nub $ map fst $ M.keys m
frequencies a = map reduce $ filter (outboundFor a) $ M.toList m
outboundFor a k = fst (fst k) == a
reduce e = (snd (fst e), snd e)
-- | Generate a sentence starting from state @s@.
--
-- Generation stops at a non-empty state ending in @'.'@; otherwise the next
-- state is drawn from the transitions of @s@ and the walk continues. States
-- are joined with single spaces, and an empty state contributes nothing.
generateSequence :: (R.MonadRandom m) => MarkovChain -> String -> m String
generateSequence m s
  | endsSentence = return s
  | otherwise = R.fromList (m M.! s) >>= generateSequence m >>= return . prepend
  where
    endsSentence = not (null s) && last s == '.'
    prepend ss = if null s then ss else s ++ " " ++ ss
-- | Build a Markov chain from sample sentences.
--
-- Each sentence is split into words and every adjacent pair of words becomes
-- a transition; the first word of a sentence is reached from the empty
-- string, which serves as the start state.
fromSample :: [String] -> MarkovChain
fromSample ss = fromTransitionMap $ L.foldl' addTransition M.empty $ concatMap pairs ss
  where
    -- The strict left fold avoids building a chain of unevaluated inserts.
    pairs s = let ws = words s in zip ("" : ws) ws
sample :: [String]
sample = [ "I am a monster."
, "I am a rock star."
, "I want to go to Hawaii."
, "I want to eat a hamburger."
, "I have a really big headache."
, "Haskell is a fun language."
, "Go eat a big hamburger."
, "Markov chains are fun to use."
]
main = do
s <- generateSequence (fromSample sample) ""
print s
The only tiny annoyance is the fake "" starting node.
Not sure if this is what you're looking for. This compiles though:
generateSequence :: (R.MonadRandom m) => MarkovChain -> String -> m String
generateSequence mc s | s == "." = return s
| otherwise = do
s' <- R.fromList $ rationalize (mc M.! s)
s'' <- generateSequence mc s'
return $ s ++ " " ++ s''
-- | Convert integer weights to the 'Rational' weights expected by @R.fromList@.
rationalize :: [(String,Int)] -> [(String,Rational)]
rationalize pairs = [(x, toRational i) | (x, i) <- pairs]
All random number generation needs to happen in either the Random monad or the IO monad. For your purpose, it's probably easiest to understand how to do that in the IO monad, using evalRandIO. In the example below, getRandom is the function we want to use. Now getRandom operates in the Random monad, but we can use evalRandIO to lift it to the IO monad, like this:
main :: IO ()
main = do
x <- evalRandIO getRandom :: IO Double
putStrLn $ "Your random number is " ++ show x
Note: The reason we have to add the type signature to the line that binds x is because in this particular example there are no other hints to tell the compiler what type we want x to be. However, if we used x in some way that makes it clear that we want it to be a Double (e.g., multiplying by another Double), then the type signature wouldn't be necessary.
Using your MarkovChain type, for a current state you can trivially get the available transitions in the form [(nextState,probability)]. (I'm using the word "probability" loosely, it doesn't need to be a true probability; any numeric weight is fine). This is what fromList in Control.Monad.Random is designed for. Again, it operates in the Random monad, but we can use evalRandIO to lift it to the IO monad. Suppose transitions is your list of transitions, having the type [(nextState,probability)]. Then, in the IO monad you can call:
nextState <- evalRandIO $ fromList transitions
You might instead want to create your own function that operates in the Random monad, like this:
-- | Draw the next state for @currState@ according to the weights stored in
-- the chain.
--
-- The parameters follow the order of the signature (chain first, state
-- second). Weights are converted with 'toRational', which is what 'fromList'
-- expects. Fails with an error if @currState@ has no entry in the chain.
-- NOTE(review): assumes Data.Map is imported qualified as @M@, as in the
-- question's module — confirm.
getRandomTransition :: RandomGen g => MarkovChain -> String -> Rand g String
getRandomTransition chain currState =
  case M.lookup currState chain of
    Just transitions -> fromList [(next, toRational w) | (next, w) <- transitions]
    Nothing -> error ("getRandomTransition: unknown state " ++ show currState)
Then you can call this function in the IO monad using evalRandIO, e.g.
nextState <- evalRandIO $ getRandomTransition chain

Why my program use so much memory?

For just a 25 MB file the memory usage is constant at 792 MB! I thought it had to do with my use of
lists, but moving certain parts of the code to vectors (the arrays where the FFT is applied, for example) didn't change the memory usage at all!
{-# LANGUAGE OverloadedStrings,BangPatterns #-}
import qualified Data.Attoparsec.Char8 as Ap
import Data.Attoparsec
import Control.Monad
import Control.Applicative
--import Control.DeepSeq (force)
import System.IO
import System.Environment
import Data.List (zipWith4,unzip4,zip4,foldl')
import Data.Bits
import Data.Complex
import Data.String (fromString)
import Data.ByteString.Internal
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as Bl
import qualified Data.Vector.Unboxed as Vu
import qualified Statistics.Transform as St
{-
I run a test on a collection of data from a file
[(1,t),(2,t),(3,t),(4,t),(5,t)]
- - -
| - - -
| | - - -
| | |
[y++t, n, y++t]
To do that, I use splitN to create a list of list
[[(1,t),(2,t),(3,t)],[(2,t),(3,t),(4,t)],[(3,t),(4,t),(5,t)]]
Map a serie of functions to determine a value for each inner collection,
and return when an event happened.
-}
data FourD b a = FourD a a a b
instance Functor (FourD c) where
fmap f (FourD x y z d) = FourD (f x) (f y) (f z) d
-- | Weights added for bits 0 to 6 of a reading, lowest bit first.
mgrav_per_bit :: [Int]
mgrav_per_bit = [ 18, 36, 71, 143, 286, 571, 1142 ]

-- | Convert raw data to mg.
--
-- Readings above 128 are replaced by @256 - a@ and the result is negated.
-- The magnitude is the sum of the weights of the bits that are set.
aToG :: Int -> Double
aToG a = fromIntegral (withSign magnitude)
  where
    isNegative = a > 128
    raw = if isNegative then 256 - a else a
    -- Only seven weights exist, so bit 7 never contributes.
    magnitude = sum [w | (w, i) <- zip mgrav_per_bit [0 .. 7], testBit raw i]
    withSign = if isNegative then negate else id
--Data is (int,int,int,time)
--Converted to (St.CD^3,Bytestring) in place of maping afterwards.
parseAcc :: Parser (FourD B.ByteString St.CD)
parseAcc = do Ap.char '('
x <- fmap ((:+0) . aToG) Ap.decimal
Ap.char ','
y <- fmap ((:+0) . aToG) Ap.decimal
Ap.char ','
z <- fmap ((:+0) . aToG) Ap.decimal
Ap.char ','
time <- takeTill (== 41)
Ap.char ')'
return $! FourD x y z time
--applies parseAcc to many lines, fails at the end of file (Need to add a newline)
parseFile = many $ parseAcc <* (Ap.endOfInput <|> Ap.endOfLine)
-- | Run 'parseFile' over the whole input and return the parsed records.
-- Anything other than a completed parse is reported with a descriptive
-- error instead of a bare 'undefined'.
readExpr input = case parse parseFile input of
  Done _ val -> val
  Partial _ -> error "readExpr: parser is waiting for more input"
  Fail _ _ msg -> error ("readExpr: parse failure: " ++ msg)
unType (FourD x y d z) = (x ,y ,d ,z)
-- Breaks a list of FourD into smaller lists, apply f and g to those lists, then filter the result based if an even happened or not
amap :: (Num c, Ord c) => ([a] -> [c]) -> ([d] -> [ByteString]) -> [FourD d a] -> [Bl.ByteString]
amap f g = (uncurry4 (zipWith4 (filterAcc))). map4 f g . unzip4 . map (unType)
where map4 f g (a,b,c,d) = (f a,f b,f c,g d)
uncurry4 f (a,b,c,d) = f a b c d
-- before i had map filterAcc,outside amap. Tried to fuse everything to eliminate intermediaries
-- An event is detected if x > 50
filterAcc x y z t = if x > 50
then (Bl.pack . B.unpack) $ "yes: " `B.append` t
else ""
-- split [St.CD] in [(Vector St.CD)], apply fft to each, and compress to a single value.
-- Core of the application
fftAcross :: [St.CD] -> [Int]
fftAcross = map (floor . noiseEnergy . St.fft) . splitN 32
-- how the value is determined (sum of all magnitudes but the first one)
noiseEnergy :: (RealFloat a, Vu.Unbox a) => Vu.Vector (Complex a) -> a
noiseEnergy x = (Vu.foldl' (\b a-> b+(magnitude a)) 0 (Vu.drop 1 x))/32
-- how the values are split in (Vector St.CD), if lenght > 32, takes 32, otherwhise I'm done
splitN :: Vu.Unbox a => Int -> [a] -> [Vu.Vector a]
splitN n x = helper x
where
helper x = if atLeast n x
then (Vu.take n (Vu.fromList x)) : (helper (drop 1 x) )
else []
-- Replacing the test by atLeast in place of a counter (that compared to length x,calculated once) reduced the behaviour that memory usage was constant.
-- this is replicated so the behaviour of splitN happens on the time part of FourD, Can't use the same since there is no Vector Bytestring instance
-- | Keep the head of every suffix of @x@ that still has at least @n@ elements.
splitN2 n x = helper x
  where
    helper ys
      | atLeast n ys = head ys : helper (drop 1 ys)
      | otherwise = []

-- | @atLeast n xs@ tells whether @xs@ has at least @n@ elements, looking at
-- no more than @n@ of them.
atLeast :: Int -> [a] -> Bool
atLeast n xs = n == 0 || (not (null xs) && atLeast (n - 1) (drop 1 xs))
-- | Read the file named by the first argument, run the detection pipeline
-- over it and write the result to @results.txt@.
--
-- The output handle is managed by 'withFile', so it is flushed and closed
-- even if parsing or processing fails. A missing argument is reported as a
-- usage error instead of a failed 'head'.
main :: IO ()
main = do
  args <- getArgs
  case args of
    (filename : _) ->
      withFile "results.txt" WriteMode $ \filehandle -> do
        contents <- liftM readExpr $ B.readFile filename
        Bl.hPutStr filehandle . Bl.unlines . splitAndApplyAndFilter $ contents
    [] -> ioError (userError "missing argument: input file name")
  where
    splitAndApplyAndFilter = amap fftAcross (splitN2 32)
Edit: after some refactoring (fusing some maps, removing calls to length), I managed to get this down to about 400 MB with a 25 MB input file. Still, on a 100 MB file, it takes 1.5 GB.
The program is intended to determine whether a certain event happened at a point in time. For that it requires a collection of values (I'm using 32 at the moment), runs an FFT on it, sums the resulting values and checks whether the sum passes a threshold. If yes, it prints the time to a file.
http://db.tt/fT8kXPKz for a 25mb testfile
I found the solution thanks to a Reddit thread about the same problem!
Parsing with Haskell and Attoparsec
The great majority of my problem was caused by the fact that attoparsec is strict and Haskell data structures are rather large (so a 100 MB text file can take up much more at run time).
The other half was that profiling doubles the memory use, and I hadn't accounted for that.
After changing the parser to be lazy, my program uses 120 MB instead of 800 MB (when the input size is 116 MB), so success!
In case this interest someone, here is the relevant piece of code change:
readExpr input = case parse (parseAcc<*(Ap.endOfLine<*Ap.endOfInput<|>Ap.endOfLine)) input of
Done b val -> val : readExpr b
Partial e -> []
Fail _ _ c -> error c
The full code:
{-# LANGUAGE OverloadedStrings,BangPatterns #-}
import qualified Data.Attoparsec.Char8 as Ap
import Data.Attoparsec
import Control.Monad
import Control.Applicative
--import Control.DeepSeq (force)
import System.IO
import System.Environment
import Data.List (zipWith4,unzip4,zip4,foldl')
import Data.Bits
import Data.Complex
import Data.String (fromString)
import Data.ByteString.Internal
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as Bl
import qualified Data.Vector.Unboxed as Vu
import qualified Statistics.Transform as St
{-
I run a test on a collection of data from a file
[(1,t),(2,t),(3,t),(4,t),(5,t)]
- - -
| - - -
| | - - -
| | |
[y++t, n, y++t]
To do that, I use splitN to create a list of list
[[(1,t),(2,t),(3,t)],[(2,t),(3,t),(4,t)],[(3,t),(4,t),(5,t)]]
Map a serie of functions to determine a value for each inner collection,
and return when an event happened.
-}
data FourD b a = FourD a a a b
instance Functor (FourD c) where
fmap f (FourD x y z d) = FourD (f x) (f y) (f z) d
-- | Weights added for bits 0 to 6 of a reading, lowest bit first.
mgrav_per_bit :: [Int]
mgrav_per_bit = [ 18, 36, 71, 143, 286, 571, 1142 ]

-- | Convert raw data to mg.
--
-- Readings above 128 are replaced by @256 - a@ and the result is negated.
-- The magnitude is the sum of the weights of the bits that are set.
aToG :: Int -> Double
aToG a = fromIntegral signed
  where
    negative = a > 128
    twocomp = if negative then 256 - a else a
    -- Weight of one bit position: the weight if the bit is set, else 0.
    bitValue weight position = if testBit twocomp position then weight else 0
    -- Only seven weights exist, so bit 7 never contributes.
    uresult = sum (zipWith bitValue mgrav_per_bit [0 .. 7])
    signed = if negative then negate uresult else uresult
--Data is (int,int,int,time)
--Converted to (St.CD^3,Bytestring) in place of maping afterwards.
parseAcc :: Parser (FourD B.ByteString St.CD)
parseAcc = do Ap.char '('
x <- fmap ((:+0) . aToG) Ap.decimal -- Parse, transform to mg, convert to complex
Ap.char ','
y <- fmap ((:+0) . aToG) Ap.decimal
Ap.char ','
z <- fmap ((:+0) . aToG) Ap.decimal
Ap.char ','
time <- takeTill (== 41)
Ap.char ')'
return $! FourD x y z time
--applies parseAcc to many lines, fails at the end of file (Need to add a newline)
parseFile = many $ parseAcc <* (Ap.endOfInput <|> Ap.endOfLine)
-- | Lazily parse the input one record at a time: each successful parse yields
-- a record and continues on the remaining input. A partial result ends the
-- list; a failure raises an error with the parser's message.
readExpr input =
  case parse oneRecord input of
    Fail _ _ c -> error c
    Partial _ -> []
    Done rest val -> val : readExpr rest
  where
    oneRecord = parseAcc<*(Ap.endOfLine<*Ap.endOfInput<|>Ap.endOfLine)
unType (FourD x y d z) = (x ,y ,d ,z)
-- Breaks a list of FourD into smaller lists, apply f and g to those lists, then filter the result based if an even happened or not
amap :: (Num c, Ord c) => ([a] -> [c]) -> ([d] -> [ByteString]) -> [FourD d a] -> [ByteString]
amap f g = (uncurry4 (zipWith4 (filterAcc))). map4 f g . unzip4 . map (unType)
where map4 f g (a,b,c,d) = (f a,f b,f c,g d)
uncurry4 f (a,b,c,d) = f a b c d
-- before i had map filterAcc,outside amap. Tried to fuse everything to eliminate intermediaries
-- An event is detected if x > 50
-- | Return the time stamp @t@ when the first value exceeds 50, and the empty
-- string otherwise. The second and third values are ignored.
filterAcc x _ _ t = if x > 50 then t else ""
-- split [St.CD] in [(Vector St.CD)], apply fft to each, and compress to a single value.
-- Core of the application
fftAcross :: [St.CD] -> [Int]
fftAcross = map (floor . noiseEnergy . St.fft) . splitN 32
-- how the value is determined (sum of all magnitudes but the first one)
noiseEnergy :: (RealFloat a, Vu.Unbox a) => Vu.Vector (Complex a) -> a
noiseEnergy x = (Vu.foldl' (\b a-> b+(magnitude a)) 0 (Vu.drop 1 x))/32
-- how the values are split in (Vector St.CD), if lenght > 32, takes 32, otherwhise I'm done
splitN :: Vu.Unbox a => Int -> [a] -> [Vu.Vector a]
splitN n x = helper x
where
helper x
| atLeast n x = (Vu.take n (Vu.fromList x)) : (helper (drop 1 x) )
| otherwise = []
-- Replacing the test by atLeast in place of a counter (that compared to length x,calculated once) reduced the behaviour that memory usage was constant.
-- this is replicated so the behaviour of splitN happens on the time part of FourD, Can't use the same since there is no Vector Bytestring instance
-- | Keep the head of every suffix of @x@ that still has at least @n@ elements.
splitN2 n x = helper x
  where
    helper ys =
      if atLeast n ys
        then head ys : helper (drop 1 ys)
        else []

-- | @atLeast n xs@ tells whether @xs@ has at least @n@ elements, looking at
-- no more than @n@ of them.
atLeast :: Int -> [a] -> Bool
atLeast n xs
  | n == 0 = True
  | null xs = False
  | otherwise = atLeast (n - 1) (drop 1 xs)
-- | Turn the stream of time stamps (empty string = no event) into interval
-- markers: a "Start Time" line at the first non-empty stamp of a run and an
-- "End Time" line, carrying the last stamp of the run, at the first empty
-- stamp after it.
intervalFinder :: [ByteString]->[B.ByteString]
intervalFinder = go ""
  where
    -- @open@ is the latest stamp of the current run, or "" outside a run.
    go _ [] = []
    go open (x : xs)
      | open == "" && x /= "" = ("Start Time: " `B.append` x `B.append` "\n") : go x xs
      | open == "" = go "" xs
      | x == "" = ("End Time: " `B.append` open `B.append` "\n\n") : go "" xs
      | otherwise = go x xs
-- | Read the file named by the first argument, run the detection pipeline
-- over it and write the detected intervals to @results.txt@.
--
-- The output handle is managed by 'withFile', so it is closed even if parsing
-- or processing fails. A missing argument is reported as a usage error
-- instead of a failed 'head'.
main :: IO ()
main = do
  args <- getArgs
  case args of
    (filename : _) ->
      withFile "results.txt" WriteMode $ \filehandle -> do
        contents <- liftM readExpr $ B.readFile filename
        Bl.hPutStr filehandle . Bl.fromChunks . intervalFinder . splitAndApplyAndFilter $ contents
    [] -> ioError (userError "missing argument: input file name")
  where
    splitAndApplyAndFilter = amap fftAcross (splitN2 32)
--contents <- liftM ((map ( readExpr )) . B.lines) $ B.readFile filename
{- *Main> let g = liftM ((amap fftAcross (splitN2 32)) . readExpr) $ B.readFile "te
stpattern2.txt"
-}
-- B.hPutStrLn (filehandle) . B.unlines . map (B.pack . show ) . amap (map (floor .quare) . (filter (/=[])) . map ( (drop 1) . (map (/32)) . fft ) . splitN 32) . map ( fmap(fromIntegral . aToG)) . map readExpr $ contents

Resources