I have a large XML file that is a concatenation of XML documents. I'm trying to use a Node.js XML splitter and respond with the number of documents I have found.
My code looks something like this. I'm looking to get the number of documents outside the function (in the last line). Is there something I can do to achieve this?
var XmlSplit = require('./xmlsplitter.js')
const fs = require('fs')
var xmlsplit = new XmlSplit()
var no_of_docs = 0
var inputStream = fs.createReadStream('./files/input/test.xml')
inputStream.pipe(xmlsplit).on('data', function(data,callback) {
var xmlDocument = data.toString();
no_of_docs = no_of_docs + 1;
})
xmlsplit.on('end', function(){
console.log('Stream ended');
console.log(no_of_docs); // <-- This prints the correct value, but it is lost as soon as we exit this callback.
});
console.log("This is outside the function " + no_of_docs); <-- I need the value here.
I am receiving data from a socket server and pushing it into an array. I also check whether the data I received from the socket already exists in the array: if it exists, ignore it; if it doesn't, push it into the existing array. For some reason, though, the socketarray.push part seems to be overwriting previous data in the socketarray variable. I have no idea what I am doing wrong here.
var socketarray = [];
socket.on('data', async data => {
var server_data = data.toString();
var parsed_result = m.parsingData(data.toString());
if (parsed_result) {
var car_id = parsed_result.car_id;
var find_string = socketarray.findIndex(x => x.car_id == car_id);
if (find_string === -1) {
var length = socketarray.push({ "car_id": car_id });
} else {
console.log("already present");
console.log(socketarray);
}
}
});
But there is also one weird thing happening. If I add this line
var length = socketarray.push({ "car_id": car_id });
before the line var find_string = socketarray.findIndex(x => x.car_id == car_id); (i.e. before the condition that is supposed to prevent duplicate data being saved), then multiple entries are pushed successfully; the only issue is the duplicates. So I have no idea why the code works before the conditional line and not after it.
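For what it's worth, one way to sidestep the findIndex/push interplay entirely is to key the entries by car_id in a Map, so the duplicate check and the insert cannot disagree. A rough sketch, assuming the same socket and m.parsingData as in the question:
const cars = new Map(); // keyed by car_id instead of scanning an array

socket.on('data', data => {
    const parsed_result = m.parsingData(data.toString()); // same parser as above
    if (parsed_result) {
        const car_id = parsed_result.car_id;
        if (cars.has(car_id)) {
            console.log('already present');
        } else {
            cars.set(car_id, { car_id: car_id });
        }
    }
});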
I tried to make the function async, but when I print attacks it prints {} with nothing in it, yet when I print the values right after adding them to attacks, I can see them. Why is that, and how can I use the value?
var fs = require('fs');
var http = require('http');
var attacks = {};
var phase_name;
var directory = 'cti-master\\enterprise-attack\\attack-pattern\\';
// getting all file names.
async function getData(directory) {
    fs.readdir(directory, (err, files) => {
        if (err) { return; }
        var fileNum = 0;
        // opening all the files and sorting the data in them.
        while (fileNum < files.length - 1) {
            fs.readFile(directory + files[fileNum], 'utf8', (err, data) => {
                // parsing the data from json.
                var fileData = JSON.parse(data);
                // sometimes there is no phase name.
                if (fileData['objects'][0]['kill_chain_phases'] == undefined) { phase_name = undefined; }
                else { phase_name = fileData['objects'][0]['kill_chain_phases'][0]['phase_name']; }
                // sorting data by name to make it easier later.
                attacks[fileData['objects'][0]['name']] = {
                    id: fileData['objects'][0]['id'],
                    type: fileData['objects'][0]['type'],
                    description: fileData['objects'][0]['description'],
                    x_mitre_platforms: fileData['objects'][0]['x_mitre_platforms'],
                    x_mitre_detection: fileData['objects'][0]['x_mitre_detection'],
                    phase_name: phase_name
                };
            });
            fileNum += 1;
        }
    });
    var keys = Object.keys(attacks);
    console.log(attacks);
}
getData(directory);
The reason for the empty log here is that Node does not wait for the while loop's file reads to finish, hence you are getting an empty log. Basically, you can improve this code by using async/await.
But if you want to stick with this code, I suggest the following logic:
just bring your log inside an if condition block, with the condition "print only if the expected file count has been reached".
For example:
if(fileNum === files.length) {
var keys = Object.keys(attacks);
console.log(attacks);
}
So now the log prints only when this condition is satisfied, which means only after all the files have actually been read.
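For reference, here is a rough async/await sketch of the same routine using fs.promises; it sidesteps the counting problem entirely by awaiting every read before logging (it assumes the same directory layout and JSON structure as in the question):
const fsp = require('fs').promises;

async function getData(directory) {
    const attacks = {};
    const files = await fsp.readdir(directory);
    for (const file of files) {
        const data = await fsp.readFile(directory + file, 'utf8');
        const obj = JSON.parse(data)['objects'][0];
        attacks[obj['name']] = {
            id: obj['id'],
            type: obj['type'],
            description: obj['description'],
            x_mitre_platforms: obj['x_mitre_platforms'],
            x_mitre_detection: obj['x_mitre_detection'],
            // sometimes there is no phase name
            phase_name: obj['kill_chain_phases'] && obj['kill_chain_phases'][0]['phase_name']
        };
    }
    console.log(attacks); // populated here, because every readFile was awaited
    return attacks;
}

getData('cti-master\\enterprise-attack\\attack-pattern\\');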
I'm attempting to write a very basic scraper that loops through a few pages and outputs all the data from each url to a single json file. The url structure goes as follows:
http://url/1
http://url/2
http://url/n
Each of the urls has a table, which contains information pertaining to the ID of the url. This is the data I am attempting to retrieve and store inside a json file.
I am still extremely new to this and having a difficult time moving forward. So far, my code looks as follows:
app.get('/scrape', function(req, res){
    var json;
    for (var i = 1163; i < 1166; i++){
        url = 'https://urlgoeshere.com' + i;
        request(url, function(error, response, html){
            if(!error){
                var $ = cheerio.load(html);
                var mN, mL, iD;
                var json = { mN : "", mL : "", iD: ""};
                $('html body div#wrap h2').filter(function(){
                    var data = $(this);
                    mN = data.text();
                    json.mN = mN;
                })
                $('table.vertical-table:nth-child(7)').filter(function(){
                    var data = $(this);
                    mL = data.text();
                    json.mL = mL;
                })
                $('table.vertical-table:nth-child(8)').filter(function(){
                    var data = $(this);
                    iD = data.text();
                    json.iD = iD;
                })
            }
            fs.writeFile('output' + i + '.json', JSON.stringify(json, null, 4), function(err){
                console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
            })
        });
    }
    res.send(json);
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
When I run the code as displayed above, the output .json files only contain data for the last URL. I presume that's because I attempt to save all the data within the same variable?
If I include res.send() inside the loop, so the data is written after each page, I receive an error that multiple headers cannot be sent.
Can someone provide some pointers as to what I'm doing wrong? Thanks in advance.
Ideal output I would like to see:
Page ID: 1
Page Name: First Page
Color: Blue
Page ID: 2
Page Name: Second Page
Color: Red
Page ID: n
Page Name: Nth Page
Color: Green
I can see a number of problems:
Your loop doesn't wait for the asynchronous operations in the loop, thus you do some things like res.send() before the asynchronous operations in the loop have completed.
Inappropriate use of cheerio's .filter().
Your json variable is constantly being overwritten so it only has the last data in it.
Your loop variable i would lose its value by the time you tried to use it in the fs.writeFile() statement.
Here's one way to deal with those issues:
const rp = require('request-promise');
const fsp = require('fs').promises;
app.get('/scrape', async function(req, res) {
    let data = [];
    for (let i = 1163; i < 1166; i++) {
        const url = 'https://urlgoeshere.com/' + i;
        try {
            const html = await rp(url);
            const $ = cheerio.load(html);
            const mN = $('html body div#wrap h2').first().text();
            const mL = $('table.vertical-table:nth-child(7)').first().text();
            const iD = $('table.vertical-table:nth-child(8)').first().text();
            // create object for this iteration of the loop
            const obj = {iD, mN, mL};
            // add this object to our overall array of all the data
            data.push(obj);
            // write a file specifically for this invocation of the loop
            await fsp.writeFile('output' + i + '.json', JSON.stringify(obj, null, 4));
            console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
        } catch(e) {
            // stop further processing on an error
            console.log("Error scraping ", url, e);
            res.sendStatus(500);
            return;
        }
    }
    // send all the data we accumulated (in an array) as the final result
    res.send(data);
});
Things different in this code:
Switch over all variable declarations to let or const
Declare route handler as async so we can use await inside.
Use the request-promise module instead of request. It has the same features, but returns a promise instead of using a plain callback.
Use the promise-based fs module (in latest versions of node.js).
Use await in order to serialize our two asynchronous (now promise-returning) operations so the for loop will pause for them and we can have proper sequencing.
Catch errors and stop further processing and return an error status.
Accumulate an object of data for each iteration of the for loop into an array.
Change .filter() to .first().
Make the response to the request handler be a JSON array of data.
FYI, you can tweak the organization of the data in obj however you want, but the point here is that you end up with an array of objects, one for each iteration of the for loop.
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.
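For instance, the rp(url) call in the handler above could be swapped for got; a small sketch, assuming a CommonJS, promise-based release of got (e.g. v11, since later major versions are ESM-only):
const got = require('got');
const cheerio = require('cheerio');

async function fetchPage(url) {
    const response = await got(url);     // resolves with a response object
    return cheerio.load(response.body);  // same $ as in the handler above
}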
I use co-busboy to parse file fields when uploading files with Koa. The official example looks like this:
var parse = require('co-busboy')
var parts = parse(this);
var part;
while (part = yield parts) {
if(!part.length) //it is a stream
part.pipe(fs.createWriteStream('some file.txt'));
}
For some reason, I want to save all of the "part" streams into an array and perform the actual file writing once all of the file streams have been fetched, i.e.:
var parse = require('co-busboy');
var parts = parse(this);
var part;
var partArray = [];
var cnt = 0;
while (part = yield parts) {
    partArray[cnt++] = part;
}
// after some processing, I perform the writing
for (var file of partArray) {
    file.pipe(fs.createWriteStream('some file.txt'));
}
The problem is that the while loop just does not continue. It seems that if I do not call part.pipe, the loop will halt.
So how can I make the while loop continue?
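The loop most likely stalls because busboy does not hand over the next part until the current one has been consumed, so something has to drain each stream. One workaround is to buffer each part into memory as it arrives (note this holds whole files in memory) and do the writes afterwards; a sketch under that assumption, keeping the question's koa 1.x / co generator style (the handler name and file names are placeholders):
var parse = require('co-busboy');
var fs = require('fs');

function* upload() {
    var parts = parse(this);
    var part;
    var files = [];
    while (part = yield parts) {
        if (!part.length) { // it is a stream
            var chunks = [];
            part.on('data', function(c) { chunks.push(c); });
            // wait until this part is fully read before asking for the next one
            yield new Promise(function(resolve) { part.on('end', resolve); });
            files.push(Buffer.concat(chunks));
        }
    }
    // after all parts are fetched, perform the actual writing
    files.forEach(function(data, i) {
        fs.writeFileSync('file-' + i + '.txt', data);
    });
}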
I need to run two commands in series that need to read data from the same stream.
After piping a stream into another one, the buffer is emptied, so I can't read data from that stream again; hence this doesn't work:
var spawn = require('child_process').spawn;
var fs = require('fs');
var request = require('request');
var inputStream = request('http://placehold.it/640x360');
var identify = spawn('identify',['-']);
inputStream.pipe(identify.stdin);
var chunks = [];
identify.stdout.on('data',function(chunk) {
chunks.push(chunk);
});
identify.stdout.on('end',function() {
var size = getSize(Buffer.concat(chunks)); //width
var convert = spawn('convert',['-','-scale',size * 0.5,'png:-']);
inputStream.pipe(convert.stdin);
convert.stdout.pipe(fs.createWriteStream('half.png'));
});
function getSize(buffer){
return parseInt(buffer.toString().split(' ')[2].split('x')[0]);
}
Request complains about this
Error: You cannot pipe after data has been emitted from the response.
and changing the inputStream to fs.createReadStream yields the same issue, of course.
I don't want to write to a file; I want to reuse, in some way, the stream that request produces (or any other stream, for that matter).
Is there a way to reuse a readable stream once it finishes piping?
What would be the best way to accomplish something like the above example?
You have to create a duplicate of the stream by piping it into two streams. You can create a simple duplicate with a PassThrough stream; it simply passes the input through to the output.
const spawn = require('child_process').spawn;
const PassThrough = require('stream').PassThrough;
const a = spawn('echo', ['hi user']);
const b = new PassThrough();
const c = new PassThrough();
a.stdout.pipe(b);
a.stdout.pipe(c);
let count = 0;
b.on('data', function (chunk) {
count += chunk.length;
});
b.on('end', function () {
console.log(count);
c.pipe(process.stdout);
});
Output:
8
hi user
The first answer only works if the streams take roughly the same amount of time to process the data. If one takes significantly longer, the faster one will request new data, consequently overwriting the data still being used by the slower one (I had this problem after trying to solve it with a duplicate stream).
The following pattern worked very well for me. It uses Streamz, a library built on streams2, together with promises to synchronize async streams via a callback. Using the familiar example from the first answer:
var spawn = require('child_process').spawn;
var PassThrough = require('stream').PassThrough;
var streamz = require('streamz').PassThrough;
var Promise = require('bluebird');

var a = spawn('echo', ['hi user']);
var b = new PassThrough();
var c = new PassThrough();

a.stdout.pipe(streamz(combineStreamOperations));
a.stdout.on('end', function() { b.end(); c.end(); }); // finish both consumers when the source ends

function combineStreamOperations(data, next) {
    b.write(data); // hand the chunk to both consumers
    c.write(data);
    Promise.join(b, c, function(b, c) { // perform n operations on the same data
        next(); // request more
    });
}

var count = 0;
b.on('data', function(chunk) { count += chunk.length; });
b.on('end', function() { console.log(count); c.pipe(process.stdout); });
You can use this small npm package I created:
readable-stream-clone
With this you can reuse readable streams as many times as you need
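Its README shows usage along these lines (a sketch from memory; check the package for the exact API):
const fs = require('fs');
const ReadableStreamClone = require('readable-stream-clone');

const source = fs.createReadStream('input.txt');
const copy1 = new ReadableStreamClone(source);
const copy2 = new ReadableStreamClone(source);

copy1.pipe(fs.createWriteStream('output1.txt'));
copy2.pipe(fs.createWriteStream('output2.txt'));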
For the general problem, the following code works fine:
var PassThrough = require('stream').PassThrough;

var a = new PassThrough();
var b1 = new PassThrough();
var b2 = new PassThrough();

a.pipe(b1);
a.pipe(b2);

b1.on('data', function(data) {
    console.log('b1:', data.toString());
});
b2.on('data', function(data) {
    console.log('b2:', data.toString());
});

a.write('text');
I have a different solution: write to two streams simultaneously. Naturally, the time to write will be the sum of the two times, but I use it to respond to a download request where I want to keep a copy of the downloaded file on my server (actually I use an S3 backup, so I cache the most-used files locally to avoid multiple file transfers).
/**
 * A utility class made to write to a file while answering a file download request
 */
class TwoOutputStreams {
    constructor(streamOne, streamTwo) {
        this.streamOne = streamOne
        this.streamTwo = streamTwo
    }

    setHeader(header, value) {
        if (this.streamOne.setHeader)
            this.streamOne.setHeader(header, value)
        if (this.streamTwo.setHeader)
            this.streamTwo.setHeader(header, value)
    }

    write(chunk) {
        this.streamOne.write(chunk)
        this.streamTwo.write(chunk)
    }

    end() {
        this.streamOne.end()
        this.streamTwo.end()
    }
}
You can then use this as a regular output stream:
const twoStreamsOut = new TwoOutputStreams(fileOut, responseStream)
and pass it to your method as if it were a response or a file output stream.
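For example, in a plain http handler it might look roughly like this (the cache file name is a placeholder; fileOut stands for the local copy and res for the outgoing response):
const fs = require('fs');
const http = require('http');

http.createServer((req, res) => {
    const fileOut = fs.createWriteStream('cached-copy.bin'); // placeholder path
    const out = new TwoOutputStreams(fileOut, res);
    out.setHeader('Content-Type', 'application/octet-stream'); // only res has setHeader
    out.write('file contents would be streamed here');
    out.end();
}).listen(3000);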
If you have async operations on the PassThrough streams, the answers posted here won't work.
A solution that works for async operations includes buffering the stream content and then creating streams from the buffered result.
To buffer the result you can use concat-stream
const Promise = require('bluebird');
const concat = require('concat-stream');

const getBuffer = function(stream){
    return new Promise(function(resolve, reject){
        var gotBuffer = function(buffer){
            resolve(buffer);
        }
        var concatStream = concat(gotBuffer);
        stream.on('error', reject);
        stream.pipe(concatStream);
    });
}
To create streams from the buffer you can use:
const { Readable } = require('stream');
const getBufferStream = function(buffer){
const stream = new Readable();
stream.push(buffer);
stream.push(null);
return Promise.resolve(stream);
}
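Putting the two helpers together, the buffered content can then be turned into as many fresh streams as needed. For example (a sketch using the getBuffer and getBufferStream functions above):
async function duplicate(inputStream) {
    const buffer = await getBuffer(inputStream);  // buffer the original stream once
    const copy1 = await getBufferStream(buffer);  // each copy is an independent Readable
    const copy2 = await getBufferStream(buffer);
    copy1.pipe(process.stdout);
    copy2.pipe(process.stdout);
}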
What about piping into two or more streams not at the same time?
For example:
var PassThrough = require('stream').PassThrough;
var mybinaryStream = stream.start(); // never-ending audio stream
var file1 = fs.createWriteStream('file1.wav', {encoding: 'binary'});
var file2 = fs.createWriteStream('file2.wav', {encoding: 'binary'});
var mypass = new PassThrough();

mybinaryStream.pipe(mypass);
mypass.pipe(file1);

setTimeout(function(){
    mypass.pipe(file2);
}, 2000);
The above code does not produce any errors, but file2 is empty.
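A likely explanation (not verified against the original setup): by the time the second pipe is attached, everything that flowed through mypass during the first two seconds has already been delivered to file1 and discarded, so file2 only sees data arriving afterwards, and nothing at all if the source has finished. One way around that is to give each destination its own PassThrough from the start and attach the file later; a sketch with a stand-in source:
var PassThrough = require('stream').PassThrough;
var fs = require('fs');

var source = process.stdin; // stand-in for the never-ending binary stream in the question
var toFile1 = new PassThrough();
var toFile2 = new PassThrough();

source.pipe(toFile1);
source.pipe(toFile2);

toFile1.pipe(fs.createWriteStream('file1.wav'));

setTimeout(function() {
    // data received during the first 2 seconds is held in toFile2's internal buffer,
    // so nothing is lost; once that buffer hits its highWaterMark the source pauses,
    // which also stalls file1 until file2 starts draining.
    toFile2.pipe(fs.createWriteStream('file2.wav'));
}, 2000);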