I'm fairly new to Haskell and I'm working on an existing code base that collects files from file shares. Conduit is used to parallelize the processing of the file shares. The scaffold is based on this tutorial. To read the file shares continuously, I added the delay and a recursive call to the streamFile function. I'm not sure whether this is the problem, but the memory allocation increases constantly, up to several gigabytes.
What could be the problem that causes the memory leak?
module FileScraper(runFileScraperFinal, FileScraper, watch, watchDirectories) where
import Actions (PostProcAction)
import Colog (LogAction, Msg, Severity)
import Conduit (ConduitM, ConduitT, MonadIO (..), MonadResource, MonadTrans (lift), MonadUnliftIO (withRunInIO), ResourceT, await, bracketP, mapMC, mapM_C, runConduit, runResourceT, yield, (.|), takeWhileC)
import Control.Concurrent (threadDelay)
import qualified Control.Concurrent.Async as Async
import qualified Control.Concurrent.STM as STM
import qualified Control.Concurrent.STM.TBMQueue as STM
import Data.ByteString (ByteString, readFile)
import Data.Conduit.Combinators (filterM, yieldMany)
import Data.Functor ((<&>))
import Data.Text (Text, unpack)
import Filters (FileFilter, DirectoryFilter)
import Polysemy (Final, Inspector (inspect), Member, Sem, makeSem)
import Polysemy.Final (bindS, getInitialStateS, getInspectorS, interpretFinal, liftS)
import Prelude hiding (filter, init, readFile)
import System.FilePath.Find (find, RecursionPredicate, (/~?), filePath, (&&?), (==?), fileType, FileType (RegularFile), always)
import System.Posix (raiseSignal, sigTERM)
data FileScraper m a where
  Watch :: [(Text, Text, FileFilter, DirectoryFilter, PostProcAction)] -> (FilePath -> ByteString -> Text -> PostProcAction -> m Bool) -> FileScraper m ()

makeSem ''FileScraper
runFileScraperFinal :: forall m. (MonadUnliftIO m => forall r a. (Member (Final m) r) => LogAction m (Msg Severity) -> Sem (FileScraper ': r) a -> Sem r a)
runFileScraperFinal _ = do
  interpretFinal @m (\case
    Watch sources callback -> do
      is <- getInitialStateS
      ins <- getInspectorS
      cb' <- bindS $ uncurry4 callback
      liftS $ withRunInIO $ \runInIO -> liftIO $ do
        runResourceT . runConduit $ watchDirectories sources .| mapMC (\(fp, fc, dest, ppa) -> lift $ do
          eff <- runInIO $ cb' ((fp, fc, dest, ppa) <$ is)
          case inspect ins eff of
            Nothing -> do
              raiseSignal sigTERM
              pure False
            Just v -> do
              pure v
          ) .| takeWhileC id .| mapM_C (const $ pure ())
    )
uncurry4 :: (a -> b -> c -> d -> e) -> ((a, b, c, d) -> e)
uncurry4 f ~(a,b,c,d) = f a b c d
watchDirectories :: MonadResource m => [(Text, Text, FileFilter, DirectoryFilter, PostProcAction)] -> ConduitM a (FilePath, ByteString, Text, PostProcAction) m ()
watchDirectories sourceToFilterMap = parSources (fmap (\(src, dest, filter, dirFilter, postProcActions) -> streamFile (unpack src) dest filter dirFilter postProcActions) sourceToFilterMap)
streamFile :: MonadResource m => FilePath -> Text -> FileFilter -> DirectoryFilter -> PostProcAction -> ConduitM a (FilePath, ByteString, Text, PostProcAction) m ()
streamFile baseDir destination filter dirFilter postProcActions = do
  newFiles <- liftIO $ find (recursionPredicate dirFilter) (fileType ==? RegularFile) baseDir
  yieldMany newFiles .| filterM (liftIO . filter) .| mapMC (\entry -> do
    liftIO $ readFile entry <&> (entry,,destination,postProcActions))
  let minutes :: Int = 60_000_000
  liftIO $ threadDelay (5 * minutes)
  streamFile baseDir destination filter dirFilter postProcActions
  where
    recursionPredicate :: DirectoryFilter -> RecursionPredicate
    recursionPredicate df = case df of
      [] -> always
      excludes -> foldl1 (&&?) $ map ((/~?) filePath . unpack) excludes
parSources :: (MonadResource m, Foldable f) => f (ConduitM () o (ResourceT IO) ()) -> ConduitT i o m ()
parSources sources = bracketP init cleanup finalSource
  where
    init = do
      -- create the queue where all sources will put their items
      queue <- STM.newTBMQueueIO 100
      -- in a separate thread, run all conduits concurrently
      a <- Async.async $ do
        Async.mapConcurrently_ (\source -> runResourceT $ runConduit (source .| sinkQueue queue)) sources
        -- once all conduits are done, close the queue
        STM.atomically (STM.closeTBMQueue queue)
      pure (a, queue)
    cleanup (async, queue) = do
      -- upon exception or cancellation, close the queue and cancel the threads
      STM.atomically (STM.closeTBMQueue queue)
      Async.cancel async
    finalSource (_, queue) = sourceQueue queue

sourceQueue :: MonadIO m => STM.TBMQueue o -> ConduitT i o m ()
sourceQueue queue = do
  mbItem <- liftIO $ STM.atomically (STM.readTBMQueue queue)
  case mbItem of
    Nothing -> pure () -- queue closed
    Just item -> yield item *> sourceQueue queue

sinkQueue :: MonadIO m => STM.TBMQueue a -> ConduitT a o m ()
sinkQueue queue = do
  mbItem <- await
  case mbItem of
    Nothing -> pure () -- no more items to come
    Just item -> do
      liftIO $ STM.atomically (STM.writeTBMQueue queue item)
      sinkQueue queue
Update (Added function that uses the callback):
...
void $ async $ watch normalisedPrefixedSources (\fp content dest ppa -> do
  log Info $ "Sending file " <> pack fp
  result <- await =<< send (unpack dest) content
  case result of
    Just True -> do
      log Info $ "File sent " <> pack fp
      res <- embed @m $ liftIO $ ppa fp
      if res then pure True else do
        log Error "Raise signal for graceful shutdown."
        embed @m $ liftIO $ raiseSignal sigTERM
        pure False
    _ -> do
      log Error $ "Error sending file " <> pack fp <> ". Raise signal for graceful shutdown."
      embed @m $ liftIO $ raiseSignal sigTERM
      pure False
  )
...
Update 2:
After removing the idempotent filter from the configuration (the changes from @K. A. Buhr are still in place), the memory consumption is constant.
type FileFilter = FilePath -> IO Bool

createIdempotentFilter :: LogAction IO Message -> M.Idempotent -> IO FileFilter
createIdempotentFilter la filterConfig = do
  cache <- newIORef []
  let configuredCacheSize :: Int = fromIntegral $ M.lruCacheSize filterConfig
  pure $ \path -> do
    fileModificationEpoch <- getModificationTime path
    cache' <- readIORef cache
    if (path, fileModificationEpoch) `elem` cache'
      then do
        la <& logText Debug ("File already in cache " <> pack path <> " | " <> pack (show fileModificationEpoch))
        pure False
      else do
        la <& logText Debug ("File not in cache " <> pack path <> " | " <> pack (show fileModificationEpoch))
        let alreadyScanned' = cache' <> [(path, fileModificationEpoch)]
        writeIORef cache $ drop (length alreadyScanned' - configuredCacheSize) alreadyScanned'
        pure True
Is there any problematic code - that causes a memory leak - in the function createIdempotentFilter?
First, make sure you rule out the ByteStrings of file contents as a source of the leak. You will have a maximum number of files in flight equal to the length of the bounded queue, and so your high watermark will be the contents of some arbitrary collection of 100 files from the input filesystems. If you're processing filesystems with large video/image files, you could see erratic, transient spikes from that. Also, if your callback is holding references to the pathnames and/or contents of (some or all of) those files, you'll see a very severe space leak as a result. Rule all this out by replacing readFile entry with return mempty and using a null callback (\_ _ _ _ -> return True).
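For instance, a minimal sketch of that diagnostic (these substitutions are mine, reusing the names from the question's code):

-- In streamFile: keep the pipeline shape but drop the file contents.
yieldMany newFiles .| filterM (liftIO . filter) .| mapMC (\entry ->
  pure (entry, mempty, destination, postProcActions))

-- At the call site: a null callback that accepts everything.
-- watch normalisedPrefixedSources (\_ _ _ _ -> pure True)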
After making a similar change myself, I was able to duplicate your space leak and tracked it down to two technical issues.
The first was:
.| takeWhileC id .| mapM_C (const $ pure ())
Replacing this with:
.| Control.Monad.void andC
reduced the maximum residency for a single pass through a test filesystem from 130MB to 15MB, but still with a characteristic linear increase in heap usage on a heap profile.
The second was:
yield item *> sourceQueue queue
Replacing this with:
yield item >> sourceQueue queue
removed the leak entirely. Maximum residency was only 2MB, and there was no discernible leak on a heap profile for multiple passes through the test filesystem.
I'm not exactly sure what's going on here, for either issue. The *> versus >> issue is a problem I've seen before. While these are semantically equivalent, they don't necessarily have the same implementation, and sometimes *> leaks space where >> doesn't. However, the takeWhileC problem is a mystery to me.
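For concreteness, here is the question's sourceQueue with that one-token fix applied:

sourceQueue :: MonadIO m => STM.TBMQueue o -> ConduitT i o m ()
sourceQueue queue = do
  mbItem <- liftIO $ STM.atomically (STM.readTBMQueue queue)
  case mbItem of
    Nothing -> pure () -- queue closed
    Just item -> yield item >> sourceQueue queue -- >> instead of *>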
Related
I'm using pipes, attoparsec, and pipes-attoparsec to write a database dump file converter. The general format of the file is to have a create table command followed by an optional insert command. In addition to transforming the statements in place, the table definitions have to be held in memory until the very end for additional processing (indexes, constraints, etc.).
This works fine, but now I need to allow some of my internal parsers to have access to my Producer's State in order to determine which parser needs to be run while processing the values from the insert command.
I tried something like this:
-- IO
import qualified Data.ByteString.Char8 as BS (putStrLn)
import System.Exit (ExitCode (..), exitSuccess, exitFailure)
import System.IO (hPutStrLn, stderr)
-- Pipes
import Pipes (runEffect, for, liftIO, Producer, Effect)
import Pipes.Attoparsec (parsed, ParsingError)
import Pipes.Lift (runStateP)
import Pipes.Safe (runSafeT)
import qualified Pipes.ByteString as PBS (stdin)
-- State
import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.State.Strict
dump' :: StateT ParserState Parser Command
dump' = fmap Create createStatements' <|> fmap Insert justData'
doStuff :: MonadIO m => Effect m (Either (ParsingError, Producer ByteString (StateT ParserState m) ()) (), ParserState)
doStuff = runStateP defaultParserState theStuff
theStuff :: MonadIO m => Effect (StateT ParserState m) (Either (ParsingError, Producer ByteString (StateT ParserState m) ()) ())
theStuff = for runParser (liftIO . BS.putStrLn <=< lift . processCommand)
runParser :: MonadIO m => Producer Command (StateT ParserState m) (Either (ParsingError, Producer ByteString (StateT ParserState m) ()) ())
runParser = do
  s <- lift get
  liftIO $ putStrLn "runParser"
  liftIO $ putStrLn $ show s
  parsed (evalStateT dump' s) PBS.stdin
processCommand :: MonadIO m => Command -> StateT ParserState m ByteString
processCommand (Create xs) = do
  currentState <- get
  liftIO $ putStrLn "processCommand"
  liftIO $ putStrLn $ show currentState
  _ <- put (currentState { constructs = xs ++ constructs currentState })
  return $ P.firstPass $ P.transformConstructs xs
processCommand (Insert x) = return x
Complete source (including parsers): https://github.com/cimmanon/mysqlnothx/blob/parser-state/src/Main.hs
When I run it, I get a result that looks something like this:
runParser
ParserState {constructs = []}
processCommand
ParserState {constructs = []}
processCommand
ParserState {constructs = [ ... ]}
processCommand
ParserState {constructs = [ ..... ]}
I was expecting runParser (which would grab the latest contents from State) to be run every time processCommand runs, but that's clearly not the case based on the output. When I check the contents of State within the parser, it's always empty no matter how many commands are parsed.
How can I extend State from my Producers to my Parser (dump') so that they share the same State? If my Producer has 4 values in State, the Parser should also see those same 4 values.
I was expecting runParser (which would grab the latest contents from State) to be run every time processCommand runs, but that's clearly not the case.
Your main effect is for runParser (liftIO . BS.putStrLn <=< lift . processCommand). To understand what this effect does you need to understand what for does:
(for p body) loops over p replacing each yield with body
"Loops over p" is accurate if a bit confusing. It doesn't run p once for each value produced by p; that would explode! Instead for replaces every yield in p with body. By replacing yield with body it runs body once for every yielded value. Running the body once for each produced value is similar to how in other languages a for-loop over a list runs the body once for each value in the list.
Your runParser is
runParser = do
  s <- lift get
  liftIO $ putStrLn "runParser"
  liftIO $ putStrLn $ show s
  parsed (evalStateT dump' s) PBS.stdin
It reads the state, outputs it, and produces the Commands parsed from stdin. pipes-attoparsec's parsed parses the source and yields once for each complete, successfully parsed value. Your for then replaces each of parsed's yields with liftIO . BS.putStrLn <=< lift . processCommand. The complete effect runs runParser once and processCommand once for each yield, which is what you're observing in the output.
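A minimal standalone illustration of that substitution (this example is mine, not from the question):

import Pipes

-- The producer's own effects run once; the body runs once per yielded value.
numbers :: Producer Int IO ()
numbers = do
  lift $ putStrLn "producer starts" -- printed a single time
  mapM_ yield [1, 2, 3]

main :: IO ()
main = runEffect $ for numbers (lift . print)
-- producer starts
-- 1
-- 2
-- 3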
I'm trying to make a Conduit that filters ANSI escape codes from ByteStrings. I've come up with a function that converts the ByteString into a stream of Word8's, does the filtering, and converts back into a stream of ByteStream at the end.
It seems to work fine when I use it in GHCi:
> runConduit $ yield "hello\27[23;1m world" .| ansiFilter .| printC
"hello world"
When I use it in my application, conduits that contain ansiFilter don't seem to pass anything through. Here is the full source:
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Conduit
import Control.Concurrent.Async
import Control.Concurrent.STM
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Conduit.TQueue
import Data.Word8 (Word8)
import qualified Data.Word8 as Word8
main :: IO ()
main = do
  queue <- atomically $ newTBQueue 25
  let qSource = sourceTBQueue queue
  atomically $ writeTBQueue queue ("hello" :: ByteString)
  race_
    (putInputIntoQueue queue)
    (doConversionAndPrint qSource)

putInputIntoQueue q =
  runConduit
    $ stdinC
    .| iterMC (atomically . writeTBQueue q)
    .| sinkNull

doConversionAndPrint src =
  runConduit
    $ src
    .| ansiFilter
    .| stdoutC

ansiFilter :: MonadIO m => ConduitM ByteString ByteString m ()
ansiFilter = toWord8 .| ansiFilter' .| toByteString
  where
    ansiFilter' = awaitForever $ \first -> do
      msecond <- peekC
      case (first, msecond) of
        (0x1b, Just 0x5b) -> do
          dropWhileC (not . Word8.isLetter)
          dropC 1
        _ -> yield first
    toWord8 = concatC

toByteString :: Monad m => ConduitM Word8 ByteString m ()
toByteString =
  (mapC BS.singleton .| foldC) >>= yield
This program is supposed to echo back the filtered contents of stdin, but nothing gets echoed back.
However, if I comment out the ansiFilter in doConversionAndPrint, echoing does work, which makes me think the ansiFilter function is wrong.
Any help would be greatly appreciated!
I reimplemented ansiFilter in terms of the higher-level chunked-data functions in conduit-combinators, like takeWhileCE. This seems to work, and it should be more efficient by letting more of the data remain in an efficient memory representation:
ansiFilter :: MonadIO m => ConduitM ByteString ByteString m ()
ansiFilter = loop
  where
    loop = do
      takeWhileCE (/= 0x1b)
      mfirst <- headCE
      case mfirst of
        Nothing -> return ()
        Just first -> assert (first == 0x1b) $ do
          msecond <- peekCE
          case msecond of
            Just 0x5b -> do
              dropWhileCE (not . Word8.isLetter)
              dropCE 1
            _ -> yield $ BS.singleton first
          loop
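One note (mine): this version also needs import Control.Exception (assert) alongside the question's imports. The GHCi check from the question should then behave the same:

-- >>> runConduit $ yield "hello\27[23;1m world" .| ansiFilter .| printC
-- "hello world"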
Went with a slightly different approach and am having more luck leaving the ByteStrings alone. I think this gives up some of the streaming, but it's acceptable for my use case.
ansiFilter :: Monad m => Conduit ByteString m ByteString
ansiFilter = mapC (go "")
  where
    csi = "\27["
    go acc "" = acc
    go acc remaining = go (acc <> filtered) (stripCode unfiltered)
      where
        (filtered, unfiltered) = BS.breakSubstring csi remaining
        stripCode bs = BS.drop 1 (BS.dropWhile (not . Word8.isLetter) bs)
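A usage note (mine, not the answerer's): because mapC transforms each ByteString chunk independently, this assumes an escape sequence never straddles a chunk boundary, which is part of the streaming generality being given up:

-- Works when the code arrives within one chunk:
-- >>> runConduit $ yield "hello\27[23;1m world" .| ansiFilter .| printC
-- "hello world"
-- A sequence split across chunks, e.g. yieldMany ["hi\27[", "1m there"],
-- would not be recognized as a single code.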
I'm processing some audio using portaudio. The Haskell FFI bindings call a user-defined callback whenever there's audio data to be processed. This callback should finish very quickly, ideally with no I/O. I want to save the audio input and return promptly, since my application doesn't need to react to the audio in real time (right now I'm just saving the audio data to a file; later I'll build a simple speech recognition system).
I like the idea of pipes and thought I could use that library. The problem is that I don't know how to create a Producer that returns data that came in through a callback.
How do I handle my use case?
Here's what I'm working with right now, in case that helps (the datum MVar isn't working right now, but I don't like storing all the data in a Seq anyway... I'd rather process it as it comes in instead of only at the end):
{-# LANGUAGE FlexibleInstances, MultiParamTypeClasses #-}
module Main where
import Codec.Wav
import Sound.PortAudio
import Sound.PortAudio.Base
import Sound.PortAudio.Buffer
import Foreign.Ptr
import Foreign.ForeignPtr
import Foreign.C.Types
import Foreign.Storable
import qualified Data.StorableVector as SV
import qualified Data.StorableVector.Base as SVB
import Control.Exception.Base (evaluate)
import Data.Int
import Data.Sequence as Seq
import Control.Concurrent
instance Buffer SV.Vector a where
  fromForeignPtr fp = return . SVB.fromForeignPtr fp
  toForeignPtr = return . (\(a, b, c) -> (a, c)) . SVB.toForeignPtr

-- | Wrap a buffer callback into the generic stream callback type.
buffCBtoRawCB' :: (StreamFormat input, StreamFormat output, Buffer a input, Buffer b output) =>
                  BuffStreamCallback input output a b -> StreamCallback input output
buffCBtoRawCB' func = \a b c d e -> do
  fpA <- newForeignPtr_ d -- We will not free, as callback system will do that for us
  fpB <- newForeignPtr_ e -- We will not free, as callback system will do that for us
  storeInp <- fromForeignPtr fpA (fromIntegral $ 1 * c)
  storeOut <- fromForeignPtr fpB (fromIntegral $ 0 * c)
  func a b c storeInp storeOut

callback :: MVar (Seq.Seq [Int32]) -> PaStreamCallbackTimeInfo -> [StreamCallbackFlag] -> CULong
         -> SV.Vector Int32 -> SV.Vector Int32 -> IO StreamResult
callback seqmvar = \timeinfo flags numsamples input output -> do
  putStrLn $ "timeinfo: " ++ show timeinfo ++ "; flags are " ++ show flags ++ " in callback with " ++ show numsamples ++ " samples."
  print input
  -- write data to output
  --mapM_ (uncurry $ pokeElemOff output) $ zip (map fromIntegral [0..(numsamples-1)]) datum
  --print "wrote data"
  input' <- evaluate $ SV.unpack input
  modifyMVar_ seqmvar (\s -> return $ s Seq.|> input')
  case flags of
    [] -> return $ if unPaTime (outputBufferDacTime timeinfo) > 0.2 then Complete else Continue
    _ -> return Complete

done doneMVar = do
  putStrLn "total done dood!"
  putMVar doneMVar True
  return ()

main = do
  let samplerate = 16000
  Nothing <- initialize
  print "initialized"
  m <- newEmptyMVar
  datum <- newMVar Seq.empty
  Right s <- openDefaultStream 1 0 samplerate Nothing (Just $ buffCBtoRawCB' (callback datum)) (Just $ done m)
  startStream s
  _ <- takeMVar m -- wait until our callbacks decide they are done!
  Nothing <- terminate
  print "let's see what we've recorded..."
  stuff <- takeMVar datum
  print stuff
  -- write out wav file
  -- let datum =
  --     audio = Audio { sampleRate = samplerate
  --                   , channelNumber = 1
  --                   , sampleData = datum
  --                   }
  -- exportFile "foo.wav" audio
  print "main done"
The simplest solution is to use MVars to communicate between the callback and Producer. Here's how:
import Control.Proxy
import Control.Concurrent.MVar
fromMVar :: (Proxy p) => MVar (Maybe a) -> () -> Producer p a IO ()
fromMVar mvar () = runIdentityP loop where
  loop = do
    ma <- lift $ takeMVar mvar
    case ma of
      Nothing -> return ()
      Just a -> do
        respond a
        loop
Your stream callback will write Just input to the MVar and your finalization callback will write Nothing to terminate the Producer.
Here's a ghci example demonstrating how it works:
>>> mvar <- newEmptyMVar :: IO (MVar (Maybe Int))
>>> forkIO $ runProxy $ fromMVar mvar >-> printD
>>> putMVar mvar (Just 1)
1
>>> putMVar mvar (Just 2)
2
>>> putMVar mvar Nothing
>>> putMVar mvar (Just 3)
>>>
Edit: The pipes-concurrency library now provides this feature, and it even has a section in the tutorial explaining specifically how to use it to get data out of callbacks.
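A rough sketch of that approach (the API names are from pipes-concurrency as I understand it; the audioCallback wiring is a hypothetical stand-in for the PortAudio callback):

import Control.Concurrent (forkIO)
import Control.Monad (void)
import Pipes
import Pipes.Concurrent
import qualified Pipes.Prelude as P
import System.Mem (performGC)

main :: IO ()
main = do
  (output, input) <- spawn unbounded
  -- The callback pushes each buffer into the mailbox and returns quickly.
  let audioCallback samples = void $ atomically $ send output samples
  _ <- forkIO $ do
    mapM_ audioCallback [[1, 2], [3, 4], [5, 6 :: Int]]
    performGC -- drop this thread's reference so the Producer can finish
  runEffect $ fromInput input >-> P.print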
The goal is to have a conduit with the following type signature
protobufConduit :: MonadResource m => (ByteString -> a) -> Conduit ByteString m a
The conduit should repeatedly parse protocol buffers (using the ByteString -> a function) received via TCP/IP (using the network-conduit package).
The wire message format is
{length (32 bits big endian)}{protobuf 1}{length}{protobuf 2}...
(The curly braces are not part of the protocol; they are only used here to separate the entities.)
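For example, a 5-byte message followed by a 12-byte message would travel as (illustrative bytes, mine; lengths big-endian):

00 00 00 05 {5 protobuf bytes} 00 00 00 0c {12 protobuf bytes}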
The first idea was to use sequenceSink to repeatedly apply a Sink that is able to parse one ProtoBuf:
[...]
import qualified Data.Binary as B
import qualified Data.Conduit.Binary as CB
import qualified Data.Conduit.Util as CU
protobufConduit :: MonadResource m => (ByteString -> a) -> Conduit ByteString m a
protobufConduit protobufDecode =
  CU.sequenceSink () $ \() ->
    do lenBytes <- CB.take 4            -- read protobuf length
       let len :: Word32
           len = B.decode lenBytes      -- decode ProtoBuf length
           intLen = fromIntegral len
       protobufBytes <- CB.take intLen  -- read the ProtoBuf bytes
       return $ CU.Emit () [ protobufDecode protobufBytes ] -- emit decoded ProtoBuf
It doesn't work (it only works for the first protocol buffer), because there seem to be a number of "leftover" bytes, already read from the source but not consumed via CB.take, that get discarded.
And I found no way of pushing "the rest back into the source".
Did I get the concept entirely wrong?
PS: Although I use protocol buffers here, the problem is not specific to protocol buffers. To debug it I always use {length}{UTF8-encoded string}{length}{UTF8-encoded string}... and a conduit similar to the one above (utf8StringConduit :: MonadResource m => Conduit ByteString m Text).
Update:
I just tried replacing the state (the () in the sample above) with the remaining bytes, and replacing the CB.take calls with calls to a function that first consumes the already-read bytes (from the state) and calls await only as needed (when the state doesn't hold enough bytes). Unfortunately, that doesn't work either: as soon as the Source has no bytes left, sequenceSink does not execute the code, even though the state still contains the remaining bytes :-(.
In case you're interested, here's the code (it isn't optimized or very good, but it should be enough to test):
utf8StringConduit :: forall m. MonadResource m => Conduit ByteString m Text
utf8StringConduit =
  CU.sequenceSink [] $ \st ->
    do (lengthBytes, st') <- takeWithState BS.empty st 4
       let len :: Word32
           len = B.decode $ BSL.fromChunks [lengthBytes]
           intLength = fromIntegral len
       (textBytes, st'') <- takeWithState BS.empty st' intLength
       return $ CU.Emit st'' [ TE.decodeUtf8 textBytes ]

takeWithState :: Monad m
              => ByteString
              -> [ByteString]
              -> Int
              -> Pipe l ByteString o u m (ByteString, [ByteString])
takeWithState acc state 0 = return (acc, state)
takeWithState acc state neededLen =
  let stateLenSum = foldl' (+) 0 $ map BS.length state
  in if stateLenSum >= neededLen
       then do let (firstChunk:state') = state
                   (neededChunk, pushBack) = BS.splitAt neededLen firstChunk
                   acc' = acc `BS.append` neededChunk
                   neededLen' = neededLen - BS.length neededChunk
                   state'' = if BS.null pushBack
                               then state'
                               else pushBack:state'
               takeWithState acc' state'' neededLen'
       else do aM <- await
               case aM of
                 Just a -> takeWithState acc (state ++ [a]) neededLen
                 Nothing -> error "to be fixed later"
For protocol buffer parsing and serializing we use messageWithLengthPutM and messageWithLengthGetM (see below) but I assume it uses a varint encoding for the length, which is not what you need. I'd probably try to adapt our implementation below by replacing the messageWithLength Get/Put with something like
myMessageWithLengthGetM =
  do size <- getWord32be
     getMessageWithSize size
but I have no idea how to implement the getMessageWithSize using the available functions from the protocol buffer package. On the other hand you could just getByteString and then "reparse" the bytestring.
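A sketch of that "reparse" route (my assumption: messageGet from Text.ProtocolBuffers parses one unframed message from a lazy ByteString; treat the exact names as unverified):

myMessageWithLengthGetM =
  do size <- getWord32be
     bytes <- getByteString (fromIntegral size)
     -- reparse the extracted bytes as a bare, length-free message
     case messageGet (BSL.fromChunks [bytes]) of
       Left err -> fail err
       Right (msg, _) -> return msg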
Regarding conduits: Have you tried implementing the conduit without Data.Conduit.Util? Something like
protobufConduit protobufDecode = loop
  where
    loop =
      do len <- liftM convertLen (CB.take 4)
         bs <- CB.take len
         yield (protobufDecode bs)
         loop
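convertLen is left undefined in that sketch; one plausible definition (assuming CB.take yields a lazy ByteString and the prefix is a big-endian Word32, which matches Data.Binary's default encoding) is:

convertLen :: BSL.ByteString -> Int
convertLen = fromIntegral . (B.decode :: BSL.ByteString -> Word32)

Note the loop as written never terminates on end of input; a real version would also need to stop when CB.take returns fewer than 4 bytes.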
Here's the code we use:
pbufSerialize :: (ReflectDescriptor w, Wire w) => Conduit w IO ByteString
pbufSerialize = awaitForever f
  where f pb = M.mapM_ yield $ BSL.toChunks $ runPut (messageWithLengthPutM pb)

pbufParse :: (ReflectDescriptor w, Wire w, Show w) => Conduit ByteString IO w
pbufParse = new
  where
    new = read (runGet messageWithLengthGetM . BSL.fromChunks . (:[]))
    read parse =
      do mbs <- await
         case mbs of
           Just bs -> checkResult (parse bs)
           Nothing -> return ()
    checkResult result =
      case result of
        Failed _ errmsg -> fail errmsg
        Partial cont -> read (cont . Just . BSL.fromChunks . (:[]))
        Finished rest _ msg ->
          do yield msg
             checkResult (runGet messageWithLengthGetM rest)
Is there a way to scope runResourceT to the lifetime of a single Sink?
I'm trying to build a Sink that wraps a potentially infinite number of Sinks. This works fine with threads, but I'm trying to do it without threads. It seems like it should be possible. I've hit a roadblock due to the scoping of runResourceT: resource management ends up either too coarse-grained (but functional) or far too fine-grained (totally broken).
{-# LANGUAGE FlexibleContexts #-}
import Control.Monad.Trans (lift)
import Control.Monad.Trans.Resource
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BC8 (pack)
import Data.Conduit
import qualified Data.Conduit.Binary as Cb
import qualified Data.Conduit.List as Cl
import System.FilePath ((<.>))
test :: IO ()
test =
  runResourceT
    $ Cl.sourceList (fmap (BC8.pack . show) [(1 :: Int)..1000])
    $$ rotateResourceHog "/tmp/foo"

-- |
-- files are allocated on demand but handles are released at the same time
rotateResourceHog
  :: MonadResource m
  => FilePath -> Sink ByteString m ()
rotateResourceHog filePath = step 0 where
  step i = do
    x <- Cl.peek
    case x of
      Just _ -> do
        chunkWriter $ filePath <.> show (i :: Integer)
        -- loop
        step $ i+1
      Nothing -> return ()

-- |
-- files are allocated on demand but handles are released immediately
rotateUsingClosedHandles
  :: (MonadBaseControl IO m, MonadResource m)
  => FilePath -> Sink ByteString m ()
rotateUsingClosedHandles filePath = step 0 where
  step i = do
    x <- Cl.peek
    case x of
      Just _ -> do
        transPipe runResourceT . chunkWriter $ filePath <.> show (i :: Integer)
        -- loop
        step $ i+1
      Nothing -> return ()

chunkWriter
  :: MonadResource m
  => FilePath -> Sink ByteString m ()
chunkWriter filePath = do
  _ <- lift $ allocate (putStrLn "alloc") (\ _ -> putStrLn "free")
  -- the actual conduit chain is more complicated
  Cl.isolate 100 =$= Cb.sinkFile filePath
ResourceT is only intended to clean up resources in exceptional cases. It is not intended to provide prompt finalization, only guaranteed finalization. For promptness, conduit provides its own facilities for handling cleanup. In your case, you're looking for both: you want cleanup to happen as early as possible, and to occur even in the case of an exception being thrown. For this, you should use bracketP. For example:
chunkWriter
  :: MonadResource m
  => FilePath -> Sink ByteString m ()
chunkWriter filePath = bracketP
  (putStrLn "alloc")
  (\() -> putStrLn "free")
  (\() -> Cl.isolate 100 =$= Cb.sinkFile filePath)
This results in the desired interleaving of alloc and free outputs.
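Concretely, with the 1000-element source in test and Cl.isolate 100, my reading of the code is that the trace interleaves one alloc/free pair per 100-element chunk:

alloc
free
alloc
free
... (ten pairs in total)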