Node.js thumbnailer using ImageMagick: nondeterministic corruption

I have a Node.js server which dynamically generates and serves small (200x200) thumbnails from images (640x640) in a database (mongodb). I'm using the node-imagemagick module for thumbnailing.
My code works roughly 95% of the time; about 1 in 20 (or fewer) thumbnailed images are corrupt on the client (iOS), which reports:
JPEG Corrupt JPEG data: premature end of data segment
For the corrupt images, the client displays the top 50% - 75% of the image, and the remainder is truncated.
The behavior is non-deterministic and the specific images which are corrupt changes on a per-request basis.
I'm using the following code to resize the image and output the thumbnail:
im.resize({
  srcData: image.imageData.buffer,
  width: opt_width,
}, function(err, stdout) {
  var responseHeaders = {};
  responseHeaders['content-type'] = 'image/jpeg';
  responseHeaders['content-length'] = stdout.length;
  debug('Writing ', stdout.length, ' bytes.');
  response.writeHead(200, responseHeaders);
  response.write(stdout, 'binary');
  response.end();
});
What could be wrong, here?
Notes:
The problem is not an incorrect content-length header. When I omit the header, the result is the same.
When I do not resize the image, the full-size image always seems to be fine.
In researching this I found this and this StackOverflow question, both of which were solved by increasing the buffer size. In my case the images are very small, so this seems unlikely to be responsible.
I was originally assigning stdout to a new Buffer(stdout, 'binary') and writing that. Removing it ('binary' will be deprecated) made no difference.

The problem seems to have been due to a slightly older version of node-imagemagick (0.1.2); upgrading to 0.1.3 was the solution.
In case this is helpful to anyone, here's the code I used to make Node.js queue up and handle client requests one at a time.
// Set up your server like normal.
http.createServer(handleRequest);
// ...
var requestQueue = [];
var isHandlingRequest = false; // When true, prevents new requests from being handled.
// If you have any endpoints that don't always call response.end(), add them here.
var urlsToHandleConcurrently = {
  '/someCometStyleThingy': true
};

function handleRequest(req, res) {
  if (req.url in urlsToHandleConcurrently) {
    handleQueuedRequest(req, res);
    return;
  }
  requestQueue.push([req, res]); // Enqueue new requests.
  processRequestQueue(); // Check if a request in the queue can be handled.
}

function processRequestQueue() {
  // Continue only if no request is being processed and the queue is not empty.
  if (isHandlingRequest) return;
  if (requestQueue.length === 0) return;
  var op = requestQueue.shift();
  var req = op[0], res = op[1];
  // Wrap .end() on the http.ServerResponse instance to
  // unblock and process the next queued item.
  res.oldEnd = res.end;
  res.end = function(data) {
    res.oldEnd(data);
    isHandlingRequest = false;
    processRequestQueue();
  };
  // Start handling the request, blocking the queue until res.end() is called.
  isHandlingRequest = true;
  handleQueuedRequest(req, res);
}

function handleQueuedRequest(req, res) {
  // Your regular request handling code here...
}
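For illustration, here is a minimal sketch of what handleQueuedRequest above could look like when it dispatches to the thumbnail code from the question; the /thumbnail/ route and the loadImage MongoDB helper are hypothetical stand-ins:
function handleQueuedRequest(req, res) {
  // Hypothetical route; anything else gets a 404.
  if (req.url.indexOf('/thumbnail/') !== 0) {
    res.writeHead(404);
    res.end(); // releases the queue via the wrapped res.end above
    return;
  }
  var imageId = req.url.slice('/thumbnail/'.length);
  // loadImage is a hypothetical helper that fetches the image document from MongoDB.
  loadImage(imageId, function(err, image) {
    if (err || !image) {
      res.writeHead(err ? 500 : 404);
      res.end();
      return;
    }
    // Same resize call as in the question.
    im.resize({
      srcData: image.imageData.buffer,
      width: 200
    }, function(err, stdout) {
      if (err) {
        res.writeHead(500);
        res.end();
        return;
      }
      res.writeHead(200, { 'content-type': 'image/jpeg' });
      res.end(stdout, 'binary'); // ends the response and unblocks the queue
    });
  });
}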

Related

Websocket - Waiting for a http request callback to execute before next pusher event

So I'm working with websockets to process data from a website's API. For every new event I also send some HTTP requests back to the website in order to obtain more data. Up until now everything has worked fine, but now that I've started using async requests to speed things up, things got a bit different. My code used to process one event and then move on to the next one (these events come in extremely quickly - around 10 per second), but now it just seems to ignore the async (non-blocking) part, moves on to the next event, and thereby skips over half of the code. Note that the code works fine outside the Pusher. I'm using the 'pusher-client' module. My code looks like this:
var Request = require("request");
var requestSync = require('sync-request');
var Pusher = require('pusher-client');
var events_channel = pusher.subscribe('inventory_changes');
events_channel1.bind('listed', function(data)
{
var var2;
//Async request (to speed up the code)
function myFunction(callback){
request("url", function(error, response, body) {
if (!error && response.statusCode == 200)
{
result = JSON.stringify(JSON.parse(body));
return callback(null, result);
}
else
{
return callback(error, null);
}
});
}
myFunction(function(err, data){
if(!err)
{
var2 = data
return(data);
}
else
{
return(err);
}
});
//The part of the code below waits for the callback and the executes some code
var var1 = var2;
check();
function check()
{
if(var2 === var1)
{
setTimeout(check, 10);
return;
}
var1 = var2;
//A CHUNK OF CODE EXECUTES HERE (connected to the data from the callback)
}
});
In conclusion, the code works, but not inside the Pusher handler, because it skips over the asynchronous request. How would I make the Pusher handler wait for my async request to finish before processing the next event (I have no idea)? If you happen to know, please let me know :)
You need to implement a queue to handle events one after another. I'm curious how it worked before; even without Pusher you'd have to implement some queue mechanism for it.
const eventsQueue = []

events_channel1.bind('listed', function(data) {
  eventsQueue.push(data)
  handleNewEvent()
})

let processingEvent = false

function handleNewEvent() {
  if (processingEvent) return // do nothing if already processing an event
  const eventData = eventsQueue.shift() // pick the first element from the array
  if (!eventData) return // all events are handled at the moment
  processingEvent = true
  ... // handle event data here
  processingEvent = false
  handleNewEvent() // handle next event
}
Also, you should call the clearTimeout method to clear your timeout when you don't need it anymore.
And it's better to use promises or async/await instead of callbacks. Your code will be much easier to read and maintain.
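For example, here is a minimal sketch of the same queue using async/await; handleEvent is a hypothetical promise-returning helper that wraps the HTTP request from the question:
const eventsQueue = [];
let processingEvent = false;

events_channel1.bind('listed', function(data) {
  eventsQueue.push(data);
  handleNewEvent();
});

async function handleNewEvent() {
  if (processingEvent) return;    // already busy with an event
  const eventData = eventsQueue.shift();
  if (!eventData) return;         // nothing queued right now
  processingEvent = true;
  try {
    await handleEvent(eventData); // waits until this event is fully processed
  } catch (err) {
    console.error('failed to handle event', err);
  }
  processingEvent = false;
  handleNewEvent();               // pick up the next queued event, if any
}

// Hypothetical helper: wraps the callback-based request in a promise.
function handleEvent(data) {
  return new Promise((resolve, reject) => {
    request('url', (error, response, body) => {
      if (error || response.statusCode !== 200) return reject(error || new Error('bad status'));
      // ...do the per-event work with JSON.parse(body) here...
      resolve();
    });
  });
}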

How to download multiple links from a .csv file using multithreading in node.js?

I am trying to download links from a .csv file and store the downloaded files in a folder. I have used a multithreading library for this, i.e. mt-files-downloader. The files download fine, but it takes too much time to download about 313 files. These files are about 400 KB in size at most. When I tried a normal download using Node I could download them in a minute or two, but with this library the download should be faster since it is multithreaded, yet it takes a lot of time. Below is my code; any help would be useful. Thanks!
var rec;
csv
  .fromStream(stream, { headers: ["Recording", , , , , , , ,] })
  .on("data", function (records) {
    rec = records.Recording;
    //console.log(rec);
    download(rec);
  })
  .on("end", function () {
    console.log('Reading complete')
  });

function download(rec) {
  var filename = rec.replace(/\//g, '');
  var filePath = './recordings/' + filename;
  var downloadPath = path.resolve(filePath)
  var fileUrl = 'http:' + rec;
  var downloader = new Downloader();
  var dl = downloader.download(fileUrl, downloadPath);
  dl.start();
  dl.on('error', function(dl) {
    var dlUrl = dl.url;
    console.log('error downloading = > ' + dl.url + ' restarting download....');
    if(!dlUrl.endsWith('.wav') && !dlUrl.endsWith('Recording')){
      console.log('resuming file download => ' + dlUrl);
      dl.resume();
    }
  });
}
You're right, downloading 313 files of 400kB should not take long - and I don't think this has to do with your code - maybe the connection is bad? Have you tried downloading a single file via curl?
Anyway I see two problems in your approach with which I can help:
first, you download all the files at the same time (which may introduce some overhead on the server);
second, your error handling will run in a loop without waiting or checking the actual file, so if there's a 404 you'll flood the server with requests.
Using streams with on('data') events has a major drawback of executing all the chunks more or less synchronously as they are read. This means that your code will execute whatever is in on('data') handler never waiting for completion of your downloads. The only limiting factor is now how fast the server can read the csv - and I'd expect millions of lines per second to be normal.
From the server's perspective, you're simply requesting 313 files at once, which (without wanting to speculate on the server's actual technical mechanisms) will result in some of those requests waiting and interfering with each other.
This can be solved by using a streaming framework, like scramjet, event-stream or highland for instance. I'm the author of the first and it's IMHO the easiest in this case, but you can use any of those, changing the code a little to match their API - it's pretty similar in all cases anyway.
Here's some heavily commented code that will run a couple of downloads in parallel:
const {StringStream} = require("scramjet");
const sleep = require("sleep-promise");
const Downloader = require('mt-files-downloader');
const downloader = new Downloader();
// First we create a StringStream class from your csv stream
StringStream.from(csvStream)
// we parse it as CSV without columns
.CSVParse({header: false})
// we set the limit of parallel operations, it will get propagated.
.setOptions({maxParallel: 16})
// now we extract the first column as `recording` and create a
// download request.
.map(([recording]) => {
// here's the first part of your code
const filename = recording.replace(/\//g, '');
const filePath = './recordings/' + filename;
const downloadPath = path.resolve(filePath)
const fileUrl = 'http:' + recording;
// at this point we return the dl object so we can keep these
// parts separate.
// see that the download hasn't been started yet
return downloader.download(fileUrl, downloadPath);
})
// what we get is a stream of not started download objects
// so we run this asynchronous function. If this returns a Promise
// it will wait
.map(
async (dl) => new Promise((res, rej) => {
// let's assume a couple retries we allow
let retries = 10;
dl.on('error', async (dl) => {
try {
// here we reject if the download fails too many times.
if (retries-- === 0) throw new Error(`Download of ${dl.url} failed too many times`);
var dlUrl = dl.url;
console.log('error downloading = > '+dl.url+' restarting download....');
if(!dlUrl.endsWith('.wav') && !dlUrl.endsWith('Recording')){
console.log('resuming file download => '+dlUrl);
// lets wait half a second before retrying
await sleep(500);
dl.resume();
}
} catch(e) {
// here we call the `reject` function - meaning that
// this file wasn't downloaded despite retries.
rej(e);
}
});
// here we call `resolve` function to confirm that the file was
// downloaded.
dl.on('end', () => res());
})
)
// we log some message and ignore the result in case of an error
.catch(e => {
console.error('An error occurred:', e.message);
return;
})
// Every stream must have some sink to flow to; the `run` method runs
// every operation above.
.run();
You can also use the stream to push out some kind of log messages and pipe them to process.stderr at the end, instead of those console.logs. Please check the scramjet documentation for additional info, and the MDN documentation on async functions.
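If you would rather not add a streaming framework, the same two fixes (bounded parallelism plus bounded retries) can be sketched with a plain worker pool over the already-collected recording URLs; downloadAll, startDownload and MAX_PARALLEL are made-up names, and the path/Downloader usage mirrors the code above:
var MAX_PARALLEL = 16;

// recs: the records.Recording values collected beforehand, e.g. pushed into an array in .on("data").
function downloadAll(recs, done) {
  var index = 0;
  var active = 0;

  function next() {
    // Start downloads until we hit the parallelism limit or run out of URLs.
    while (active < MAX_PARALLEL && index < recs.length) {
      active++;
      startDownload(recs[index++], function() {
        active--;
        if (index >= recs.length && active === 0) return done();
        next();
      });
    }
  }
  next();
}

// One download with a bounded number of retries, built from the question's code.
function startDownload(rec, cb) {
  var filename = rec.replace(/\//g, '');
  var downloadPath = path.resolve('./recordings/' + filename);
  var dl = new Downloader().download('http:' + rec, downloadPath);
  var retries = 10;
  dl.on('error', function() {
    if (retries-- > 0) setTimeout(function() { dl.resume(); }, 500); // wait before retrying
    else cb(new Error('download failed: ' + rec));
  });
  dl.on('end', function() { cb(null); });
  dl.start();
}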

EventEmitter memory leak detected: Proper way to pass CSV data to multiple modules?

I am dipping my toe into using different npm modules my own way whereas before I just executed already created gulpfiles. The npm module penthouse loads a webpage and determines the above the fold CSS for that page. I am trying to take that module and use it with a site crawler so I can get the above the fold css for all pages, and store that CSS in a table.
So essentially I am:
Crawling a site to get all the urls
capturing the page id from each url
storing pages & their id's in a CSV
load the CSV and pass each URL to penthouse
take penthouse output and store it in a table
So I am fine up until the last two steps. When I am reading the CSV, I get the error possible EventEmitter memory leak detected. 11 exit listeners added. Use emitter.setMaxListeners() to increase limit.
The stack trace points here at line 134. After reading about the error, it makes sense because I see a bunch of event listeners being added, but I don't see penthouse ever really executing and closing the event listeners.
It works just fine standalone as expected (Running penthouse against a single page then exiting). But when I execute the code below to try and loop through all URLs in a csv, it spits out the memory leak error twice, and just hangs. None of my console.log statements in the following script are executed.
However, I added a console.log at the end of the penthouse index.js file, and it is executed multiple times (where it adds event listeners), but it never times out or exits.
So it's clear I am not integrating this properly, but not sure how to proceed. What would be the best way to force it to read one line in the CSV at a time, process the URL, then take the output and store it in the DB before moving onto the next line?
const fs = require('fs');
var csv = require('fast-csv');
var penthouse = require('penthouse'),
path = require('path');
var readUrlCsv = function() {
var stream = fs.createReadStream("/home/vagrant/urls.csv");
var csvStream = csv()
//returns single line from CSV
.on("data", function(data) {
// data[0]: table id, data[1]: page type, data[2]: url
penthouse({
url : data[2],
css : './dist/styles/main.css'
}, function(err, criticalCss) {
if (err) {
console.log(err);
}
console.log('do we ever get here?'); //answer is no
if (data[1] === 'post') {
wp.posts().id( data[0] ).post({
inline_css: criticalCss
}).then(function( response ) {
console.log('saved to db');
});
} else {
wp.pages().id( data[0] ).page({
inline_css: criticalCss
}).then(function( response ) {
console.log('saved to db');
});
}
});
})
.on("end", function(){
console.log("done");
});
return stream.pipe(csvStream);
};
UPDATE
Changed my method to look like below so it processes all rows first, but still throws the same error. Writes "done" to the console, and immediately spits out the memory warning twice.
var readUrlCsv = function() {
var stream = fs.createReadStream("/home/vagrant/urls.csv");
var urls = [];
var csvStream = csv()
.on("data", function(data) {
// data[0]: table id, data[1]: page type, data[2]: url
urls.push(data);
})
.on("end", function(){
console.log("done");
buildCriticalCss(urls);
});
return stream.pipe(csvStream);
};
var buildCriticalCss = function(urls) {
//console.log(urls);
urls.forEach(function(data, idx) {
//console.log(data);
penthouse({
url : data[2],
css : './dist/styles/main.css',
// OPTIONAL params
width : 1300, // viewport width
height : 900, // viewport height
timeout: 30000, // ms; abort critical css generation after this timeout
strict: false, // set to true to throw on css errors (will run faster if no errors)
maxEmbeddedBase64Length: 1000 // characters; strip out inline base64 encoded resources larger than this
}, function(err, criticalCss) {
if (err) {
console.log(err);
}
console.log('do we ever finish one?');
if (data[1] === 'post') {
console.log('saving post ' + data[0]);
wp.posts().id( data[0] ).post({
inline_css: criticalCss
}).then(function( response ) {
console.log('saved post to db');
});
} else {
console.log('saving page ' + data[0]);
wp.pages().id( data[0] ).page({
inline_css: criticalCss
}).then(function( response ) {
console.log('saved page to db');
});
}
});
});
};
Update 2
I took the simple approach to control the amount of concurrent processes spawned.
var readUrlCsv = function() {
var stream = fs.createReadStream("/home/vagrant/urls.csv");
var urls = [];
var csvStream = csv()
.on("data", function(data) {
// data[0]: table id, data[1]: page type, data[2]: url
urls.push(data);
})
.on("end", function(){
console.log("done");
//console.log(urls);
buildCriticalCss(urls);
});
return stream.pipe(csvStream);
};
function buildCriticalCss(data) {
var row = data.shift();
console.log(row);
penthouse({
url : row[2],
css : './dist/styles/main.css',
// OPTIONAL params
width : 1300, // viewport width
height : 900, // viewport height
timeout: 30000, // ms; abort critical css generation after this timeout
strict: false, // set to true to throw on css errors (will run faster if no errors)
maxEmbeddedBase64Length: 1000 // characters; strip out inline base64 encoded resources larger than this
}, function(err, criticalCss) {
if (err) {
console.log('err');
}
// handle your criticalCSS
console.log('finished');
console.log(row[2]);
// now start next job, if we have more urls
if (data.length !== 0) {
buildCriticalCss(data);
}
});
}
The error message you're seeing is a default printed to the console by node's event library if more than the allowed number of event listeners are defined for an instance of EventEmitter. It does not indicate an actual memory leak. Rather it is displayed to make sure you're aware of the possibility of a leak.
You can see this by checking the events.EventEmitter source code at lines 20 and 244.
To stop EventEmitter from displaying this message and since penthouse does not expose its specific EventEmitter, you'll need to set the default allowed event emitter listeners to something larger than its default value of 10 using:
var EventEmitter = require('events').EventEmitter;
EventEmitter.defaultMaxListeners = 20;
Note that according to Node's documentation for EventEmitter.defaultMaxListeners, this will change the maximum number of listeners for all instances of EventEmitter, including those that have already been defined previous to the change.
Or you could simply ignore the message.
Further to the hanging of your code, I'd advise gathering all the results from the parsing of your CSV into an array, and then processing the array contents separately from the parsing process.
This would accomplish two things: it would allow you to be assured the entire CSV file was valid before you started processing, and to instrument debugging messages while processing each element, which would give you deeper insight into how each element of the array was processed.
UPDATE
As noted below, depending on how many URLs you're processing, you're probably overwhelming Node's ability to handle all of your requests in parallel.
One easy way to proceed would be to use eventing to marshall your processing so your URLs are processed sequentially, as in:
var assert = require('assert'),
  events = require('events'),
  fs = require('fs'),
  csv = require('fast-csv'),
  penthouse = require('penthouse');
var emitter=new events.EventEmitter();
/** Container for URL records read from CSV file.
*
* @type {Array}
*/
var urls=[];
/** Reads urls from file and triggers processing
*
* @emits processUrl
*/
var readUrlCsv = function() {
var stream = fs.createReadStream("/home/vagrant/urls.csv");
stream.on('error',function(e){ // always handle errors!!
console.error('failed to createReadStream: %s',e);
process.exit(-1);
});
var csvStream = csv()
.on("data", function(data) {
// data[0]: table id, data[1]: page type, data[2]: url
urls.push(data);
})
.on("end", function(){
console.log("done reading csv");
//console.log(urls);
emitter.emit('processUrl'); // start processing URLs
})
.on('error',function(e){
console.error('failed to parse CSV: %s',e);
process.exit(-1);
});
// no return required since we don't do anything with the result
stream.pipe(csvStream);
};
/** Event handler to process a single URL
*
* @emits processUrl
*/
var onProcessUrl=function(){
// always check your assumptions
assert(Array.isArray(urls),'urls must be an array');
var urlRecord=urls.shift();
if(urlRecord){
assert(Array.isArray(urlRecord),'urlRecord must be an array');
assert(urlRecord.length>2,'urlRecord must have at least three elements');
penthouse(
{
// ...
},
function(e,criticalCss){
if(e){
console.error('failed to process record %s: %s',urlRecord,e);
return; // IMPORTANT! do not drop through to rest of func!
}
// do what you need with the result here
if(urls.length===0){ // ok, we're done
console.log('completed processing URLs');
return;
}
emitter.emit('processUrl');
}
);
}
}
/**
* processUrl event - triggers processing of next URL
*
* @event processUrl
*/
emitter.on('processUrl',onProcessUrl); // assign handler
// start everything going...
readUrlCsv();
The benefit of using events here rather than your solution is the lack of recursion which can easily overwhelm your stack.
Hint: You can use events to handle all program flow issues normally addressed by Promises or modules like async.
And since events are at the very heart of Node (the "event loop"), it's really the best, most efficient way to solve such problems.
It's both elegant and "The Node Way"!
Here is a gist that illustrates the technique, without relying on streams or penthouse, the output of which is:
url: url1
RESULT: RESULT FOR url1
url: url2
RESULT: RESULT FOR url2
url: url3
RESULT: RESULT FOR url3
completed processing URLs
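For reference, a stripped-down sketch of the same technique (no streams, no penthouse) that produces output like the listing above; fakePenthouse stands in for the real asynchronous work:
var events = require('events');
var emitter = new events.EventEmitter();

// Hypothetical stand-ins for the CSV rows and for penthouse.
var urls = ['url1', 'url2', 'url3'];
function fakePenthouse(url, cb) {
  setTimeout(function() { cb(null, 'RESULT FOR ' + url); }, 10);
}

emitter.on('processUrl', function() {
  var url = urls.shift();
  if (!url) {
    console.log('completed processing URLs');
    return;
  }
  console.log('url: ' + url);
  fakePenthouse(url, function(err, result) {
    if (err) {
      console.error('failed to process %s: %s', url, err);
      return;
    }
    console.log('RESULT: ' + result);
    emitter.emit('processUrl'); // next URL only after this one finished
  });
});

emitter.emit('processUrl'); // start everything going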
Besides using console.logs, which are usually enough, you can also use the built-in debugger: https://nodejs.org/api/debugger.html
Another thing you can do is go into the node_modules/penthouse directory and add your console.logs or debugger statements into the code for that module. That way you can debug your program there rather than the module just being a black box.
Also make sure there isn't some kind of race condition where, for example, the CSV isn't always fully written before your code tries to read it in.
I think that the memory leak issue is probably a red herring as far as making your code function.
From your comment it sounds like you want to do something like the following with async.mapSeries: http://promise-nuggets.github.io/articles/15-map-in-series.html. You could also use promises as that article shows, or, once promises are set up, use async/await with a regular for loop after compiling with Babel. In the long run I recommend doing that sort of thing with async/await and Babel, but that might be overkill just to get this working.
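As an illustration, here is a minimal sketch that wraps the callback-style penthouse call used in the question in a promise and processes the rows strictly one at a time with async/await; the storage step is left as a placeholder:
// Wrap the callback-style penthouse call (as used in the question) in a promise.
function runPenthouse(options) {
  return new Promise(function(resolve, reject) {
    penthouse(options, function(err, criticalCss) {
      if (err) return reject(err);
      resolve(criticalCss);
    });
  });
}

// Process rows one after another; each row is [table id, page type, url] as in the question.
async function buildCriticalCss(rows) {
  for (const row of rows) {
    try {
      const criticalCss = await runPenthouse({
        url: row[2],
        css: './dist/styles/main.css'
      });
      // Store criticalCss for row[0] here, e.g. the wp.posts()/wp.pages() calls from the question.
      console.log('finished', row[2]);
    } catch (err) {
      console.log('failed', row[2], err);
    }
  }
}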

Meteor: How do I stream and parse a large file to an async Node function?

I'm using the job-collection package to do the following:
Download a large file with a bunch of metadata about webpages
Create a stream from the file metadata that is split by a regex using the NPM event-stream package
Check if there is a match of the metadata in a collection (I've been attempting to stream each webpage's metadata to another function to do this)
The file is too large to buffer, so streaming is required. Here is a small file with a few examples of the metadata if you wish to try this.
Each job from the job-collection package is already inside an async function:
var request = Npm.require('request');
var zlib = Npm.require('zlib');
var EventStream = Meteor.npmRequire('event-stream');
function (job, callback) {
//This download is much too long to block
request({url: job.fileURL, encoding: null}, function (error, response, body) {
if (error) console.error('Error downloading File');
if (response.statusCode !== 200) console.error(response.statusCode, 'Status not 200');
var responseEncoding = response.headers['content-type'];
console.log('response encoding is %s', responseEncoding);
if (responseEncoding === 'application/octet-stream' || 'binary/octet-stream') {
console.log('Received binary/octet-stream');
var regexSplit = /WARC\/1\./;
response.pipe(zlib.createGunzip()
.pipe(EventStream.split(regexSplit))
.pipe(EventStream.map(function (webpageMetaData) {
/* Need parse the metaData or pass each webpageMetaData to function
* This next function could block if it had to */
searchPageMetaData(webpageMetaData); // pass each metadatum to this function to update a collection - this function can be synchronous
}));
} else {
console.error('Wrong encoding');
}
});
}
function searchWebPageMetaData(metaData) {
// Parse JSON and search collection for match
}
Are there better ways to structure this? Am I on the right track?
Where to put Meteor.bindEnvironment? - do I bind the environment each time I pass to searchWebPageMetaData()? Do I need to expressly use fibers here?
The stream stops when running this if I run it to process.stdout. Am I supposed to put the stream into one of Meteor's wrap
I'm aware of Meteor.wrapAsync. Do I want to wrap the innermost searchWebPageMetaData() function in Meteor.wrapAsync? (think I'm answering this yes as I type)
Will the stream slow to compensate for the slowness of the DB calls? My guess is no but how do I deal with that?
I've spent quite a while learning about Meteor's wrapAsync, and bindEnvironment but having trouble bringing it all together and understanding where to use them.
SUPPLEMENT 1
Just to clarify, the steps are:
Download file;
Create stream;
unzip it;
split it into individual webPages - EventStream handles this
send it to a function - don't need return values; this could be blocking, it's just some searching and database call
I was trying to do something like this, except the core code I need help with was in a function in a different file. The following code has most of @electric-jesus' answer in there.
processJobs('parseWatFile', {
concurrency: 1,
cargo: 1,
pollInterval: 1000,
prefetch: 1
}, function (job, callback) {
if (job.data.watZipFileLink) {
queue.pause();
console.log('queue should be paused now');
var watFileUrl = 'https://s3.amazonaws.com/ja-common-crawl/exampleWatFile.wat.gz';
function searchPageMetaData(webpageMetaData, callback) {
console.log(webpageMetaData); // Would be nice to just get this function logging each webPageMetaData
future.return(callback(webpageMetaData)); //I don't need this to return any value - do I have to return something?
}
if (!watFile)
console.error('No watFile passed to downloadAndSearchWatFileForEntity ');
var future = new Future(); // Doc Brown would be proud.
if(typeof callback !== 'function') future.throw('callbacks are supposed to be functions.');
request({url: watFile, encoding: null}, function (error, response, body) {
if (error) future.throw('Error Downloading File');
if (response.statusCode !== 200) future.throw('Expected status 200, got ' + response.statusCode + '.');
var responseEncoding = response.headers['content-type'];
if (responseEncoding === 'application/octet-stream' || 'binary/octet-stream') {
var regexSplit = /WARC\/1\./;
response.pipe(zlib.createGunzip()
.pipe(EventStream.split(regexSplit))
.pipe(EventStream.map(function (webpageMetaData) {
searchPageMetaData(webpageMetaData, callback);
})
));
} else {
future.throw('Wrong encoding');
}
});
return future.wait();
} else {
console.log('No watZipFileLink for this job');
job.log('ERROR: NO watZipFileLink from commonCrawlJob collection');
}
queue.resume();
job.done();
callback();
}
Interesting, looks alright. I've never worked with job-collection, but it seems to be just a Mongo-driven task queue, so I am assuming it works like a regular queue. For anything with callbacks I almost always use the Future pattern, e.g.:
var request = Npm.require('request');
var zlib = Npm.require('zlib');
var EventStream = Meteor.npmRequire('event-stream');
var Future = Npm.require('fibers/future');
var searchWebPageMetaData = function (metaData) {
// Parse JSON and search collection for match
// make it return something
var result = /droids/ig.test(metaData);
return result;
}
var processJob = function (job, callback) {
var future = new Future(); // Doc Brown would be proud.
if(typeof callback !== 'function') future.throw("Oops, you forgot that callbacks are supposed to be functions.. not undefined or whatever.");
//This download is much too long to block
request({url: job.fileURL, encoding: null}, function (error, response, body) {
if (error) future.throw("Error Downloading File");
if (response.statusCode !== 200) future.throw("Expected status 200, got " + downloadResponse.statusCode + ".");
var responseEncoding = response.headers['content-type'];
if (responseEncoding === 'application/octet-stream' || responseEncoding === 'binary/octet-stream') {
var regexSplit = /WARC\/1\./;
response.pipe(zlib.createGunzip()
.pipe(EventStream.split(regexSplit))
.pipe(EventStream.map(function (webpageMetaData) {
/* Need parse the metaData or pass each webpageMetaData to function
* This next function could block if it had to */
// pass each metadatum to this function to update a collection - this function can be synchronous
future.return(callback(webpageMetaData)); // this way, processJob returns whatever we find in the completed webpage, via callback.
}));
} else {
future.throw('Wrong encoding');
}
});
return future.wait();
}
Example usage:
So whenever you assign the result here:
var currentJob = processJob(myjob, searchWebPageMetaData);
even though the assignment looks synchronous, the async work still gets done and the result is delivered just in time for you.
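For instance, inside a job-collection worker it might be wired up roughly like this, mirroring the processJobs signature from the supplement above (error handling omitted):
processJobs('parseWatFile', { concurrency: 1 }, function (job, callback) {
  // processJob blocks this fiber (via future.wait) until its future resolves,
  // without blocking the Node event loop.
  var result = processJob(job, searchWebPageMetaData);
  console.log('processJob returned:', result);
  job.done();
  callback();
});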
To answer your questions,
Where to put Meteor.bindEnvironment? - do I bind the environment each time I pass to searchWebPageMetaData()? Do I need to expressly use fibers here?
not really, i believe the explicit use of fibers/future already take care of this.
The stream stops when running this if I run it to process.stdout. Am I supposed to put the stream into one of Meteor's wrap
How do you mean? I vaguely remember that process.stdout is blocking, which might be a cause. Again, wrapping the result in a future should take care of this.
I'm aware of Meteor.wrapAsync. Do I want to wrap the innermost searchWebPageMetaData() function in Meteor.wrapAsync? (think I'm answering this yes as I type)
Take a look at the Meteor.wrapAsync helper code. It's basically a future resolution applied, of course you can do this then again you can also explicitly use fibers/future on its own with no problem.
Will the stream slow to compensate for the slowness of the DB calls? My guess is no but how do I deal with that?
Not really sure what you mean here, but since we're trying to use asynchronous fibers, my guess is no as well. I've yet to see any slowness with the use of fibers. Probably only if there are multiple jobs launched (and running concurrently) at once will you have a performance issue in terms of memory usage. Keep the concurrent queue low, as Fibers can be quite powerful at running stuff at the same time. You only have one core to process it all; that's a sad fact, as Node can't multi-core :(
This is quite tricky if you want to handle all errors correctly. So you should ask yourself what to do if your code throws an exception, or if the error event handler is called. You want errors to propagate correctly, that is, to be thrown as an exception in the fiber calling the streaming code. I implemented something like this for one of our job-collection tasks, for extracting tar files.
First you need some helper functions:
bindWithFuture = (futures, mainFuture, fun, self) ->
  wrapped = (args...) ->
    future = new Future()
    if mainFuture
      future.resolve (error, value) ->
        # To resolve mainFuture early when an exception occurs
        mainFuture.throw error if error and not mainFuture.isResolved()
        # We ignore the value
    args.push future.resolver()
    try
      futures.list.push future
      fun.apply (self or @), args
    catch error
      future.throw error
    # This waiting does not really do much because we are
    # probably in a new fiber created by Meteor.bindEnvironment,
    # but we can still try to wait
    Future.wait future
  Meteor.bindEnvironment wrapped, null, self

wait = (futures) ->
  while futures.list.length
    Future.wait futures.list
    # Some elements could be added in meantime to futures,
    # so let's remove resolved ones and retry
    futures.list = _.reject futures.list, (f) ->
      if f.isResolved()
        # We get to throw an exception if there was an exception.
        # This should not really be needed because exception should
        # be already thrown through mainFuture and we should not even
        # get here, but let's check for every case.
        f.get()
        true # And to remove resolved
And then you can run something like:
mainFuture = new Future()
# To be able to override list with a new value in wait we wrap it in an object
futures =
  list: []
bindWithOnException = (f) =>
  Meteor.bindEnvironment f, (error) =>
    mainFuture.throw error unless mainFuture.isResolved()
onWebpageMetaData = (metaData, callback) =>
  return callback null if mainFuture.isResolved()
  # Do whatever you want here.
  # Call callback(null) when you finish.
  # Call callback(error) if there is an error.
  # If you want to call into a Meteor code inside some other callback for async code you use,
  # use bindWithOnException to wrap a function and stay inside a Meteor environment and fiber.
  MeteorCollection.insert
    metaData: metaData
  callback null
requestFuture = new Future()
request
  url: job.fileURL
  encoding: null
,
  (error, response, body) ->
    return requestFuture.throw error if error
    return requestFuture.throw new Error "Expected status 200, got #{ response.statusCode }." unless response.statusCode is 200
    requestFuture.return response
response = requestFuture.wait()
responseEncoding = response.headers['content-type']
throw new Error "Wrong encoding" unless responseEncoding in ['application/octet-stream', 'binary/octet-stream']
regexSplit = /WARC\/1\./
response.pipe(
  zlib.createGunzip()
).pipe(
  EventStream.split regexSplit
).pipe(
  EventStream.map bindWithFuture futures, mainFuture, onWebpageMetaData
).on('end', =>
  # It could already be resolved by an exception from bindWithFuture or bindWithOnException
  mainFuture.return() unless mainFuture.isResolved()
).on('error', (error) =>
  # It could already be resolved by an exception from bindWithFuture or bindWithOnException
  mainFuture.throw error unless mainFuture.isResolved()
)
mainFuture.wait()
wait futures

How to use filesystem's createReadStream with Meteor router(NodeJS)

I need to allow the user of my app to download a file with Meteor. Currently what I do is, when the user requests to download a file, I enter into a "fileRequests" collection in Mongo a document with the file location and a timestamp of the request, and return the ID of the newly created request. When the client gets the new ID it immediately goes to mydomain.com/uploads/:id. I then use something like this to intercept the request before Meteor does:
var connect = Npm.require("connect");
var Fiber = Npm.require("fibers");
var path = Npm.require('path');
var fs = Npm.require("fs");
var mime = Npm.require("mime");
__meteor_bootstrap__.app
.use(connect.query())
.use(connect.bodyParser()) //I add this for file-uploading
.use(function (req, res, next) {
Fiber(function() {
if(req.method == "GET") {
// get the id here, and stream the file using fs.createReadStream();
}
next();
}).run();
});
I check to make sure the file request was made less than 5 seconds ago, and I immediately delete the request document after I've queried it.
This works, and is secure (enough), I think. No one can make a request without being logged in, and 5 seconds is a pretty small window for someone to hijack the created request URL, but I just don't feel right with my solution. It feels dirty!
So I attempted to use Meteor-Router to accomplish the same thing. That way I can check whether they're logged in correctly without the 5-second open-to-the-world trickery.
So here's the code I wrote for that:
Meteor.Router.add('/uploads/:id', function(id) {
var path = Npm.require('path');
var fs = Npm.require("fs");
var mime = Npm.require("mime");
var res = this.response;
var file = FileSystem.findOne({ _id: id });
if(typeof file !== "undefined") {
var filename = path.basename(file.filePath);
var filePath = '/var/MeteorDMS/uploads/' + filename;
var stat = fs.statSync(filePath);
res.setHeader('Content-Disposition', 'attachment; filename=' + filename);
res.setHeader('Content-Type', mime.lookup(filePath));
res.setHeader('Content-Length', stat.size);
var filestream = fs.createReadStream(filePath);
filestream.pipe(res);
return;
}
});
This looks great, fits right in with the rest of the code and is easy to read, no hacking involved, BUT! It doesn't work! The browser spins and spins and never quite knows what to do. I have ZERO error messages coming up. I can keep using the app on other tabs. I don't know what it's doing, it never stops "loading". If I restart the server, I get a 0 byte file with all the correct headers, but I don't get the data.
Any help is greatly appreciated!!
EDIT:
After digging around a bit more, I noticed that trying to turn the response object into a JSON object results in a circular structure error.
Now the interesting thing about this is that when I listen to the filestream for the "data" event, and attempt to stringify the response object I don't get that error. But if I attempt to do the same thing in my first solution(listen to "data" and stringify the response) I get the error again.
So using the Meteor-Router solution something is happening to the response object. I also noticed that on the "data" event response.finished is flagged as true.
filestream.on('data', function(data) {
fs.writeFile('/var/MeteorDMS/afterData', JSON.stringify(res));
});
The Meteor router installs a middleware to do the routing. All Connect middleware either MUST call next() (exactly once) to indicate that the response is not yet settled or MUST settle the response by calling res.end() or by piping to the response. It is not allowed to do both.
I studied the source code of the middleware (see below). We see that we can return false to tell the middleware to call next(). This means we declare that this route did not settle the response and we would like to let other middleware do their work.
Or we can return a template name, a text, an array [status, text] or an array [status, headers, text], and the middleware will settle the response on our behalf by calling res.end() using the data we returned.
However, by piping to the response, we already settled the response. The Meteor router should not call next() nor res.end().
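For illustration, here is what those return conventions look like in a route; this is a sketch based on the contract described above, and the /hello and /skip routes are made up:
Meteor.Router.add('/hello', function() {
  // The middleware settles the response for us: status, headers, body.
  return [200, {'Content-Type': 'text/plain'}, 'Hello'];
});

Meteor.Router.add('/skip', function() {
  // Tell the middleware this route did not settle the response, so it calls next().
  return false;
});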
We solved the problem by forking the Meteor router and making a small change. We replaced the else in line 87 (after if (output === false)) by:
else if (typeof(output)!="undefined") {
See the commit with sha 8d8fc23d9c in my fork.
This way return; in the route method will tell the router to do nothing. Of course you already settled the response by piping to it.
Source code of the middleware as in the commit with sha f910a090ae:
// hook up the serving
__meteor_bootstrap__.app
  .use(connect.query()) // <- XXX: we can probably assume accounts did this
  .use(this._config.requestParser(this._config.bodyParser))
  .use(function(req, res, next) {
    // need to wrap in a fiber in case they do something async
    // (e.g. in the database)
    if(typeof(Fiber)=="undefined") Fiber = Npm.require('fibers');
    Fiber(function() {
      var output = Meteor.Router.match(req, res);
      if (output === false) {
        return next();
      } else {
        // parse out the various type of response we can have
        // array can be
        // [content], [status, content], [status, headers, content]
        if (_.isArray(output)) {
          // copy the array so we aren't actually modifying it!
          output = output.slice(0);
          if (output.length === 3) {
            var headers = output.splice(1, 1)[0];
            _.each(headers, function(value, key) {
              res.setHeader(key, value);
            });
          }
          if (output.length === 2) {
            res.statusCode = output.shift();
          }
          output = output[0];
        }
        if (_.isNumber(output)) {
          res.statusCode = output;
          output = '';
        }
        return res.end(output);
      }
    }).run();
  });
