Unzip a file and parse its contents to JavaScript objects - Node.js

Hi, I'm trying to unzip a file from my C drive and parse its contents into JavaScript objects.
Here is the code:
var AdmZip = require('adm-zip');
var fs = require('fs'), xml2js = require('xml2js');

var parser = new xml2js.Parser();
var paramdata = 'c:/sample/kusuma.zip';
console.log(paramdata);

var zip = new AdmZip(paramdata);
var zipEntries = zip.getEntries();
var obj = [];
var count = 0;

zipEntries.forEach(function (zipEntry) {
    var len = zipEntries.length;
    console.log(zipEntry.toString());
    console.log(zipEntry.entryName);
    fs.readFile("", function (err, data) {
        console.log(data);
        parser.parseString(data, function (err, result) {
            count++;
            console.log(count);
            obj.push(result);
            if (count === len) {
                console.log(obj);
                res.send(obj);
            }
        });
    });
});
Please check the code and provide some more examples.

Well, fs.readFile() is for reading files that sit directly on disk, which these entries don't.
However, adm-zip has already read the contents of the .zip into memory, so you shouldn't need fs at all. Each zipEntry has getData() and getDataAsync() methods that can be used to retrieve its contents.
zipEntries.forEach(function (zipEntry) {
    zipEntry.getDataAsync(function (data) {
        parser.parseString(data, function (err, result) {
            console.log(result);
        });
    });
});
Also, as zipEntries is an Array, you can use .filter() to reduce it to only XML files.
var zipEntries = zip.getEntries().filter(function (zipEntry) {
    return !zipEntry.isDirectory && /\.xml$/.test(zipEntry.entryName);
});
You'll also want to determine len once from the collection rather than inside each iteration, and you can compare it against obj.length rather than keeping a separate count:
var len = zipEntries.length;
var obj = [];

zipEntries.forEach(function (zipEntry) {
    zipEntry.getDataAsync(function (data) {
        parser.parseString(data, function (err, result) {
            obj.push(result);
            if (obj.length === len) {
                res.send(obj);
            }
        });
    });
});
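Putting those pieces together, here's a minimal sketch of how the whole thing could look inside an Express route handler; the Express setup, route path, and error logging are assumptions, since the original snippet only shows res.send being used:
var express = require('express');
var AdmZip = require('adm-zip');
var xml2js = require('xml2js');

var app = express();

app.get('/parse-zip', function (req, res) {
    var parser = new xml2js.Parser();
    var zip = new AdmZip('c:/sample/kusuma.zip');

    // Keep only the XML entries, as suggested above
    var zipEntries = zip.getEntries().filter(function (zipEntry) {
        return !zipEntry.isDirectory && /\.xml$/.test(zipEntry.entryName);
    });

    var len = zipEntries.length;
    var obj = [];
    if (len === 0) return res.send(obj);

    zipEntries.forEach(function (zipEntry) {
        zipEntry.getDataAsync(function (data) {
            parser.parseString(data, function (err, result) {
                if (err) console.log(err);
                obj.push(result);
                // Respond once every entry has been parsed
                if (obj.length === len) {
                    res.send(obj);
                }
            });
        });
    });
});

app.listen(3000);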

Related

Node.js sqlite3 not flushing to disk

I'm having a problem with sqlite3 not flushing to disk. The code I'm using is below. My total file list is over 470k entries and the program tends to use several gigabytes of memory. While the program is running, test.db stays at 0 bytes and no journal is used. It only starts writing to disk when db.close() runs.
var fs = require('fs');
var sqlite3 = require('sqlite3').verbose();
var db = new sqlite3.Database('test.db');

db.serialize(function () {
    db.run("BEGIN;");
    db.run("CREATE TABLE if not exists Files (name TEXT);");
    db.run("COMMIT;");
    var files = fs.readdirSync("./files/");
    console.log("File list completed: " + files.length);
    for (var i = 0; i < files.length; i++) {
        db.run("INSERT INTO Files VALUES (?);", files[i]);
    }
});
db.close();
I have tried removing db.run("BEGIN;"); and db.run("COMMIT;"); but it does not help.
Is this a bug? I'm reporting it as a bug on GitHub.
I think there is a problem with how the transaction interacts with db.serialize. db.serialize gives you no control over when each statement completes, so I don't find it very useful here.
Try explicit control flow like the code below:
var fs = require('fs');
var sqlite3 = require('sqlite3');
var async = require('async');

var db = new sqlite3.Database('test.db');

async.series([
    function (cb) {
        db.run('CREATE TABLE if not exists Files (name TEXT)', cb);
    },
    function (cb) {
        db.run('begin transaction', cb);
    },
    function (cb) {
        var files = fs.readdirSync("./files/");
        async.each(
            files,
            function (file, cb) { db.run('INSERT INTO Files VALUES (?)', file, cb); },
            cb
        );
    },
    function (cb) {
        db.run('commit transaction', cb);
    }
],
function (err) {
    if (err) {
        console.log(err);
        db.run('rollback transaction'); // can fail if the error occurred on CREATE TABLE
    }
    db.close();
});
If you don't need all-or-nothing insertion, you can try the following code instead:
var fs = require('fs');
var sqlite3 = require('sqlite3');
var async = require('async');

var db = new sqlite3.Database('test.db');

db.run('CREATE TABLE if not exists Files (name TEXT)', function (err) {
    if (err)
        return console.log(err);
    var files = fs.readdirSync("./files/");
    async.eachSeries(
        files,
        function (file, cb) { db.run('INSERT INTO Files VALUES (?)', file, cb); },
        function (err) {
            console.log((err) ? err : 'Done');
            db.close();
        }
    );
});
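As a further variation (just a sketch under the same assumptions of a ./files/ directory and the Files table), you could reuse a single prepared statement for all of the inserts inside the same async.series flow, which avoids re-parsing the INSERT statement for every row:
var fs = require('fs');
var sqlite3 = require('sqlite3');
var async = require('async');

var db = new sqlite3.Database('test.db');

async.series([
    function (cb) {
        db.run('CREATE TABLE if not exists Files (name TEXT)', cb);
    },
    function (cb) {
        db.run('begin transaction', cb);
    },
    function (cb) {
        // One prepared statement reused for every row
        var stmt = db.prepare('INSERT INTO Files VALUES (?)');
        fs.readdirSync('./files/').forEach(function (file) {
            stmt.run(file);
        });
        // finalize's callback fires after the queued runs have completed
        stmt.finalize(cb);
    },
    function (cb) {
        db.run('commit transaction', cb);
    }
],
function (err) {
    if (err) console.log(err);
    db.close();
});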

Node transform stream not waiting

I'm getting a file stream from Busboy and then piping it to a custom transform stream to validate and clean it. It works with small files, but as they get bigger my custom stream doesn't wait for the Busboy stream to finish and the output gets truncated.
Here is the busboy code:
busboy
    .on("file", function (fieldname, file, filename, encoding, mimetype) {
        // Creating a mongo doc first
        Dataset.create(dataset, function (err, ds) {
            if (err) {...}
            else {
                file.pipe(validateCSV);
            }
        });

        validateCSV
            .on("finish", function () {
                // Send to Data Import
                datasetService.import(validateCSV, dataset, function (err, result) {
                    ...
                });
            });
    });
And my transform stream:
var util = require('util');
var Transform = require('stream').Transform;

module.exports.ValidateCSV = ValidateCSV;

function ValidateCSV(options) {
    if (!(this instanceof ValidateCSV)) return new ValidateCSV(options);
    if (!options) options = {};
    options.objectMode = true;
    Transform.call(this, options);
}
util.inherits(ValidateCSV, Transform);

ValidateCSV.prototype._transform = function (chunk, encoding, done) {
    if (this._checked) {
        this.push(chunk);
    } else {
        // Do some validation on the first chunk (header row)
        var data = chunk.toString();
        var lines = data.match(/[^\r\n]+/g);
        var headerline = lines[0] || "";
        var header = headerline.split(",");
        ...
        this._checked = true;
        this.push(chunk);
    }
    done();
};
It turned out to be a backpressure issue, and setting the highWaterMark option on the transform stream fixed it. Ideally it would be set according to the file size of the upload, but this fixed it for me:
function ValidateCSV(options) {
    if (!(this instanceof ValidateCSV)) return new ValidateCSV(options);
    if (!options) options = {};
    options.objectMode = true;
    options.highWaterMark = 100000;
    Transform.call(this, options);
}
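For completeness, here's a sketch of how the fixed transform might be wired into the Busboy handler above. Creating a fresh ValidateCSV per upload and the error handling are assumptions on my part; the original snippets don't show where validateCSV is constructed:
busboy.on("file", function (fieldname, file, filename, encoding, mimetype) {
    // New transform per upload, with the raised water mark baked into the constructor
    var validateCSV = new ValidateCSV();

    validateCSV.on("finish", function () {
        // Send to Data Import once the whole upload has been validated
        datasetService.import(validateCSV, dataset, function (err, result) {
            if (err) console.log(err);
        });
    });

    Dataset.create(dataset, function (err, ds) {
        if (err) return console.log(err);
        file.pipe(validateCSV);
    });
});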

Node.js async - build object from loop, then do something with object

I'm trying to run a function and once that function is complete, then run another function. The first function reads a CSV file, makes a GET request, and builds an object. The second function uses that newly created object to create a new CSV file.
The problem I'm having is that the new CSV file is being created prior to the GET requests finishing.
I'm using async.parallel to control the flow, but I'm not able to get the logic right.
I'd love to know what I'm doing wrong and better understand how node thinks about these tasks.
// Require
var request = require('request');
var fs = require('fs');
var json2csv = require('json2csv');
var csv = require('csv');
var async = require('async');

// Params
var emailHunter_apiKey = '0000';
var emails = [];
var fields = ['email'];
var i = 0;

// Start
async.parallel([
    function (callback) {
        setTimeout(function () {
            var file = fs.readFileSync('file.csv');
            csv.parse(file, {delimiter: ','}, function (err, data) {
                for (var key in data) {
                    if (i < 5) {
                        if (data.hasOwnProperty(key)) {
                            var h = data[key];
                            if (h[5] != '') {
                                var url = h[5];
                                url = url.replace('//', '');
                                url = url.replace('www.', '');
                                request('https://api.emailhunter.co/v1/search?domain=' + url + '&api_key=' + emailHunter_apiKey + '', function (error, response, body) {
                                    if (!error && response.statusCode == 200) {
                                        var json = JSON.parse(body);
                                        for (var subObj in json) {
                                            if (json.hasOwnProperty(subObj) && subObj == 'emails') {
                                                var emailObj = json[subObj];
                                                for (var key in emailObj) {
                                                    var email = {
                                                        'email': emailObj[key]['value']
                                                    };
                                                    emails.push(email);
                                                }
                                            }
                                        }
                                    }
                                });
                            }
                        }
                    }
                    i++;
                }
            });
            callback(null, emails);
        }, 200);
        console.log(emails);
    }
],
function (err, results) {
    json2csv({data: results, fields: fields}, function (err, csv) {
        if (err) console.log(err);
        fs.writeFile('export.csv', csv, function (err) {
            if (err) throw err;
            console.log('file saved');
        });
    });
    console.log(results);
});
As laggingreflex mentioned, you're using async incorrectly.
First you should build an array of the functions you want to execute in parallel, and then use async to execute them.
Furthermore, your callback was being executed immediately because csv.parse() is asynchronous: Node kicks it off and then runs callback(null, emails) before the parsing (and the requests) have finished. You need to move the callback inside of parse().
Try this...
// Params (the requires from the question above are assumed)
var emailHunter_apiKey = '0000';
var emails = [];
var fields = ['email'];
var i = 0;
var functionsToRunAsync = [];

var file = fs.readFileSync('file.csv');
csv.parse(file, {delimiter: ','}, function (err, data) {
    for (var key in data) {
        if (i < 5) {
            if (data.hasOwnProperty(key)) {
                var h = data[key];
                if (h[5] != '') {
                    var url = h[5];
                    url = url.replace('//', '');
                    url = url.replace('www.', '');
                    // add a new function to the array, to be executed later;
                    // the immediately-invoked wrapper captures the current url for that function
                    (function (url) {
                        functionsToRunAsync.push(function (callback) {
                            request('https://api.emailhunter.co/v1/search?domain=' + url + '&api_key=' + emailHunter_apiKey + '', function (error, response, body) {
                                if (!error && response.statusCode == 200) {
                                    var json = JSON.parse(body);
                                    for (var subObj in json) {
                                        if (json.hasOwnProperty(subObj) && subObj == 'emails') {
                                            var emailObj = json[subObj];
                                            for (var key in emailObj) {
                                                emails.push({
                                                    'email': emailObj[key]['value']
                                                });
                                            }
                                        }
                                    }
                                }
                                // callback to tell async this function is complete (exactly once per request)
                                callback();
                            });
                        });
                    })(url);
                }
            }
        }
        i++;
    }

    // now that we have all of the functions in an array, we run them in parallel
    async.parallel(
        functionsToRunAsync,
        function (err, results) { // all async functions complete
            json2csv({data: emails, fields: fields}, function (err, csv) {
                if (err) console.log(err);
                fs.writeFile('export.csv', csv, function (err) {
                    if (err) throw err;
                    console.log('file saved');
                });
            });
            console.log(emails);
        });
});
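A more compact variant of the same idea, shown only as a sketch that assumes the Email Hunter response shape used above, is to let async.map collect the per-domain results for you instead of pushing into a shared emails array:
// Build the list of domains first (same cleanup as above)
csv.parse(fs.readFileSync('file.csv'), {delimiter: ','}, function (err, data) {
    var domains = [];
    data.slice(0, 5).forEach(function (row) {
        if (row[5] != '') {
            domains.push(row[5].replace('//', '').replace('www.', ''));
        }
    });

    // One request per domain; async.map gathers every callback's result into one array
    async.map(domains, function (domain, callback) {
        request('https://api.emailhunter.co/v1/search?domain=' + domain + '&api_key=' + emailHunter_apiKey, function (error, response, body) {
            if (error || response.statusCode != 200) return callback(null, []);
            var json = JSON.parse(body);
            // json.emails is assumed to be the array of {value: ...} objects seen above
            callback(null, (json.emails || []).map(function (e) {
                return {email: e.value};
            }));
        });
    }, function (err, results) {
        // results is an array of arrays (one per domain); flatten before writing the CSV
        var flat = [].concat.apply([], results);
        json2csv({data: flat, fields: fields}, function (err, csvOut) {
            if (err) return console.log(err);
            fs.writeFile('export.csv', csvOut, function (err) {
                if (err) throw err;
                console.log('file saved');
            });
        });
    });
});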

Streaming the results of multiple fs.readFile calls

I'm a newbie at Node.js streams, and what I want to achieve is streaming the results of the readFile calls from a module that I have. I then want to somehow consume this readable stream in my main app and listen to data events, so that every time readFile returns a result, a data event fires and the object is passed as a chunk. This is what I've got so far, and it's throwing an error...
var fs = require('fs');
var path = require('path');

function streamObjects(type, dirname) {
    var Readable = require('stream').Readable;
    var rs = new Readable({objectMode: true});
    fs.readdir(dirname, function (err, files) {
        if (err)
            console.log(err);
        for (var i = 0; i < 10; i++) {
            fs.readFile(path.resolve(dirname, files[i]), function (err, data) {
                if (err)
                    console.log(err);
                rs.push(JSON.parse(data));
            }); // end readFile
        } // end for loop
        return rs;
    });
}
You want to use an EventEmitter.
readFile is asynchronous, so its callback hasn't run yet at the point where you return rs, and since that return sits inside the readdir callback, the caller never receives the stream anyway.
var util = require('util');
var EventEmitter = require('events').EventEmitter;
var fs = require('fs');
var path = require('path');

function streamObjects(type, dirname) {
    EventEmitter.call(this);
    var self = this;
    fs.readdir(dirname, function (err, files) {
        if (err)
            return console.log(err);
        // forEach gives each callback its own `file`, so the right filename is emitted
        files.slice(0, 10).forEach(function (file) {
            fs.readFile(path.resolve(dirname, file), function (err, data) {
                if (err)
                    return console.log(err);
                self.emit('data', file, JSON.parse(data));
            }); // end readFile
        }); // end forEach
    });
}
util.inherits(streamObjects, EventEmitter);

module.exports = streamObjects;
From another file
var streamObjects = require('./streamObjects');
var streamObjectInstance = new streamObjects(type, dirName);
streamObjectInstance.on('data', yourFunctionHere);
I did not add an 'error' event, but you can emit one when an error happens.
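If you'd rather keep the Readable stream the question started from (so consumers can use pipe() and get backpressure), a minimal sketch is to push each parsed object and signal the end with push(null) once the last file finishes; the ten-file limit mirrors the original loop:
var fs = require('fs');
var path = require('path');
var Readable = require('stream').Readable;

function streamObjects(type, dirname) {
    var rs = new Readable({objectMode: true});
    rs._read = function () {}; // objects are pushed as the files finish reading

    fs.readdir(dirname, function (err, files) {
        if (err) return rs.emit('error', err);
        var pending = Math.min(10, files.length);
        if (pending === 0) return rs.push(null);

        files.slice(0, 10).forEach(function (file) {
            fs.readFile(path.resolve(dirname, file), function (err, data) {
                if (err) rs.emit('error', err);
                else rs.push(JSON.parse(data));
                if (--pending === 0) rs.push(null); // end of stream after the last file
            });
        });
    });

    return rs; // returned synchronously, before any file has been read
}

module.exports = streamObjects;

// Usage (paths are placeholders):
// var objectStream = require('./streamObjects')('json', './data');
// objectStream.on('data', function (obj) { console.log(obj); });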

How do I download a bunch of files from a remote server, and concatenate them in a certain order using Node.js?

I'm trying to read the contents of a bunch of javascript files on a server, and then concatenate them into a new local file. The files have to be concatenated in a specific order (specified in an array). Here's what I have so far:
var http = require('http');
var fs = require('fs');

var commonWebFiles = getCommonWebDependenciesInOrder();
var fileContents = [];
var path = '/folder/';

fs.mkdir("target");

for (var i = 0, l = commonWebFiles.length; i < l; ++i) {
    getFileContents(path, commonWebFiles[i]);
}

function getCommonWebDependenciesInOrder() {
    // Hit manager to get a correctly ordered array of common web dependencies
    // Stub
    return [
        'file1.js',
        'file2.js',
        'file3.js'
    ];
};

function getFileContents(path, filename) {
    var contents = "";
    var writeStream = fs.createWriteStream("target/" + filename, {'flags': 'a'});
    var options = {
        host: 'ahost.net',
        port: 80,
        path: path + filename
    };
    var req = http.get(options, function (res) {
        res.on('data', function (chunk) {
            contents += chunk;
        });
        res.on('end', function () {
            writeStream.write(contents, encoding = 'binary');
            writeStream.end();
            fileContents[filename] = contents;
        });
    }).on('error', function (e) {
        console.log("Got error: " + e.message);
    });
};
This downloads the files and recreates them locally, but it seems a little clunky. When I tried to write a single file directly from a looped set of requests, the chunks came back out of order... I feel like there must be an easier way.
Thanks in advance.
Use async and request:
var fs = require('fs'),
    async = require('async'),
    request = require('request');

// Utility function to adapt request's (err, response, body) callback
// to the (err, body) shape that async expects
function simpleRequest(url, callback) {
    request(url, function (err, response, body) {
        callback(err, body);
    });
}

// urls: the full file URLs, in the order they should be concatenated;
// outfile: the path of the combined output file
async.map(urls, simpleRequest, function (err, results) {
    if (err)
        return console.error(err);
    fs.writeFile(outfile, results.join(''));
});
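For completeness, here's a sketch of how this could be wired to the question's ordered list; the host and output path are carried over from the question's snippets and are assumptions about the real setup. async.map delivers its results in the same order as the input urls, so the concatenation order is preserved:
var urls = getCommonWebDependenciesInOrder().map(function (filename) {
    return 'http://ahost.net/folder/' + filename;
});
var outfile = 'target/combined.js';

async.map(urls, simpleRequest, function (err, results) {
    if (err)
        return console.error(err);
    // results[n] corresponds to urls[n], so joining keeps the required order
    fs.writeFile(outfile, results.join('\n'), function (err) {
        if (err)
            return console.error(err);
        console.log('Wrote ' + outfile);
    });
});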
