I'm trying to save the result to a JSON file, but when I look at the file, the output is cut off partway through. Something is wrong in my code, but I don't understand which part. Thanks for your help.
var request = require("request");
var cheerio = require("cheerio");
var fs = require('fs');

var urls = ["http://www.fordencuotas.com.ar"];

var req = function(url){
    request({
        uri: url,
    }, function(error, response, body) {
        var $ = cheerio.load(body);
        $("a").each(function() {
            var link = $(this);
            var itri = {iti: new Array(link.attr("href"))};
            var data = JSON.stringify(itri);
            fs.writeFile("file.json", data, function(err){
                if(err){console.log(err);} else {console.log("archivo guardado..");}
            });
        });
    });
}

for (var i = 0; i < urls.length; i++){
    req(urls[i]);
}
console.log("cargando...");
This is the output:
[opmeitle@localhost crawler1]$ node crawmod.js
cargando...
archivo guardado..
archivo guardado..
archivo guardado..
archivo guardado..
archivo guardado..
...
archivo guardado..
[opmeitle@localhost crawler1]$ cat file.json
{"iti":["productos/autos/nuevo-focus.html"]}us.html"]}
[opmeitle@localhost crawler1]$
There are a couple of issues in your code.
First, you are trying to overwrite the same file (file.json) for each a element. I'm not sure if that's your intention, but it seems rather pointless.
Secondly, fs.writeFile is asynchronous. That means that Node doesn't wait until the file is written before it returns to your loop. In other words, for each a element you open the same file, while it might have already been opened by an earlier iteration of your loop. And each iteration is writing to the same file, so you're going to end up with unexpected results.
You can either use fs.writeFileSync to synchronously write the file, which would make Node wait until the data has been written to the file before continuing, or gather all the data that you want saved to the file in a variable, and — after the $("a").each(...) loop — write that variable to the file just once.
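The first option could look roughly like this (just a sketch; note that it still rewrites the same file.json on every iteration, which is why the second option is usually preferable):

// Sketch of the synchronous alternative: each write finishes before
// the next iteration starts, so the writes can no longer interleave.
$("a").each(function() {
    var link = $(this);
    var itri = {iti: [link.attr("href")]};
    fs.writeFileSync("file.json", JSON.stringify(itri));
    console.log("archivo guardado..");
});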
That last solution could look something like this:
var data = [];
$("a").each(function() {
    var link = $(this);
    var itri = {iti: new Array(link.attr("href"))};
    data.push(itri);
});

fs.writeFile("file.json", JSON.stringify(data), function(err){
    if(err){console.log(err);} else {console.log("archivo guardado..");}
});
I tried to make the function async, but when I print attacks it prints {} with nothing in it. However, when I print the values right after adding them to attacks, I can see them. Why is that, and how can I use the values?
var fs = require('fs');
var http = require('http');

var attacks = {};
var phase_name;
var directory = 'cti-master\\enterprise-attack\\attack-pattern\\';

// getting all files names.
async function getData(directory){
    fs.readdir(directory, (err, files) => {
        if(err) { return; }
        var fileNum = 0;
        // opening all the files and sorting the data in them.
        while (fileNum < files.length - 1)
        {
            fs.readFile(directory + files[fileNum], 'utf8', (err, data) =>
            {
                // parsing the data from json.
                var fileData = JSON.parse(data);
                // sometimes there is no phase name.
                if(fileData['objects'][0]['kill_chain_phases'] == undefined){phase_name = undefined;}
                else{phase_name = fileData['objects'][0]['kill_chain_phases'][0]['phase_name'];}
                // sorting data by name to make it easier later.
                attacks[fileData['objects'][0]['name']] = {
                    id: fileData['objects'][0]['id'],
                    type: fileData['objects'][0]['type'],
                    description: fileData['objects'][0]['description'],
                    x_mitre_platforms: fileData['objects'][0]['x_mitre_platforms'],
                    x_mitre_detection: fileData['objects'][0]['x_mitre_detection'],
                    phase_name: phase_name};
            });
            fileNum += 1;
        };
    });
    var keys = Object.keys(attacks);
    console.log(attacks);
}
getData(directory);
The reason for the empty log is that Node does not wait for the while loop (or rather, for the asynchronous reads started inside it) to finish, hence you are getting an empty log. Basically, you can improve this code by using the async/await approach.
But if you want to stick with this code, I suggest this logic: bring your log inside an if condition block whose condition is "print only once the expected file count has been reached".
For example:
if(fileNum === files.length) {
    var keys = Object.keys(attacks);
    console.log(attacks);
}
So now the log prints only when this condition is satisfied, which means after completion of the while loop.
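If you do want the cleaner async/await fix mentioned above, a rough sketch could look like this (using the promise-based fs.promises API available in recent Node versions, and keeping the question's field names):

const fsp = require('fs').promises;

// Sketch only: each file is awaited in turn, so `attacks` is fully
// populated before it is logged.
async function getData(directory) {
    const attacks = {};
    const files = await fsp.readdir(directory);
    for (const file of files) {
        const fileData = JSON.parse(await fsp.readFile(directory + file, 'utf8'));
        const obj = fileData['objects'][0];
        const phases = obj['kill_chain_phases'];
        attacks[obj['name']] = {
            id: obj['id'],
            type: obj['type'],
            description: obj['description'],
            x_mitre_platforms: obj['x_mitre_platforms'],
            x_mitre_detection: obj['x_mitre_detection'],
            phase_name: phases ? phases[0]['phase_name'] : undefined
        };
    }
    console.log(attacks);
    return attacks;
}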
I'm attempting to write a very basic scraper that loops through a few pages and outputs all the data from each url to a single json file. The url structure goes as follows:
http://url/1
http://url/2
http://url/n
Each of the urls has a table, which contains information pertaining to the ID of the url. This is the data I am attempting to retrieve and store inside a json file.
I am still extremely new to this and having a difficult time moving forward. So far, my code looks as follows:
app.get('/scrape', function(req, res){
    var json;
    for (var i = 1163; i < 1166; i++){
        url = 'https://urlgoeshere.com' + i;
        request(url, function(error, response, html){
            if(!error){
                var $ = cheerio.load(html);
                var mN, mL, iD;
                var json = { mN : "", mL : "", iD: ""};

                $('html body div#wrap h2').filter(function(){
                    var data = $(this);
                    mN = data.text();
                    json.mN = mN;
                })

                $('table.vertical-table:nth-child(7)').filter(function(){
                    var data = $(this);
                    mL = data.text();
                    json.mL = mL;
                })

                $('table.vertical-table:nth-child(8)').filter(function(){
                    var data = $(this);
                    iD = data.text();
                    json.iD = iD;
                })
            }

            fs.writeFile('output' + i + '.json', JSON.stringify(json, null, 4), function(err){
                console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
            })
        });
    }
    res.send(json);
})

app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
When I run the code as displayed above, the output within the output.json file only contains data for the last url. I presume that's because I attempt to save all the data within the same variable?
If I include res.send() inside the loop, so the data writes after each page, I receive the error that multiple headers cannot be sent.
Can someone provide some pointers as to what I'm doing wrong? Thanks in advance.
Ideal output I would like to see:
Page ID: 1
Page Name: First Page
Color: Blue
Page ID: 2
Page Name: Second Page
Color: Red
Page ID: n
Page Name: Nth Page
Color: Green
I can see a number of problems:
Your loop doesn't wait for the asynchronous operations in the loop, thus you do some things like res.send() before the asynchronous operations in the loop have completed.
Inappropriate use of cheerio's .filter().
Your json variable is constantly being overwritten so it only has the last data in it.
Your loop variable i would lose its value by the time you tried to use it in the fs.writeFile() statement.
Here's one way to deal with those issues:
const rp = require('request-promise');
const fsp = require('fs').promises;

app.get('/scrape', async function(req, res) {
    let data = [];
    for (let i = 1163; i < 1166; i++) {
        const url = 'https://urlgoeshere.com/' + i;
        try {
            const html = await rp(url);
            const $ = cheerio.load(html);
            const mN = $('html body div#wrap h2').first().text();
            const mL = $('table.vertical-table:nth-child(7)').first().text();
            const iD = $('table.vertical-table:nth-child(8)').first().text();
            // create object for this iteration of the loop
            const obj = {iD, mN, mL};
            // add this object to our overall array of all the data
            data.push(obj);
            // write a file specifically for this invocation of the loop
            await fsp.writeFile('output' + i + '.json', JSON.stringify(obj, null, 4));
            console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
        } catch(e) {
            // stop further processing on an error
            console.log("Error scraping ", url, e);
            res.sendStatus(500);
            return;
        }
    }
    // send all the data we accumulated (in an array) as the final result
    res.send(data);
});
Things different in this code:
Switch over all variable declarations to let or const
Declare route handler as async so we can use await inside.
Use the request-promise module instead of request. It has the same features, but returns a promise instead of using a plain callback.
Use the promise-based fs module (in latest versions of node.js).
Use await in order to serialize our two asynchronous (now promise-returning) operations so the for loop will pause for them and we can have proper sequencing.
Catch errors and stop further processing and return an error status.
Accumulate an object of data for each iteration of the for loop into an array.
Change .filter() to .first().
Make the response to the request handler be a JSON array of data.
FYI, you can tweak the organization of the data in obj however you want, but the point here is that you end up with an array of objects, one for each iteration of the for loop.
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.
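For example, with got() (sketch, assuming the v11-style CommonJS API where the promise resolves with a response object), only the request line in the loop above changes:

const got = require('got');

// Inside the for loop, instead of `const html = await rp(url);`:
const response = await got(url);      // resolves with a response object
const $ = cheerio.load(response.body);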
I am trying to write a Node.js program. I have a file data.json that contains JSON objects. For every object I have to add another key, review, whose value is the contents of a text file. Below I wrote code that reads the data from the JSON file, inserts the key/value pair into every object, and stores the objects in an array named matter. I used a callback to pass the data back to the calling function, but the callback executes before the for loop in Fetchdata finishes. How can I call the callback after the for loop?
var fs = require('fs');
var jf = require('jsonfile');
var file = '../data/data.json';

function readfile(str, callback) {
    fs.readFile(str, function (err, data) {
        callback && callback(data.toString());
    });
}

function Fetchdata(callback) {
    var matter = [];
    jf.readFile(file, function (err, jsonData) {
        var j = 0;
        for (var i = 0; i < jsonData.length; ++i) {
            obj = jsonData[i];
            var purchase_url = obj["purchase_url"];
            if (purchase_url.indexOf("flipkart") > -1) {
                var ss = purchase_url.split("pid=");
                if (ss[1]) {
                    var s2 = ss[1].split('&');
                    readfile(s2[0], function(some){
                        "use strict";
                        obj["review"] = some;
                        matter.push(obj);
                    })
                }
            }
        }
        callback && callback(matter);
    });
}

Fetchdata(function (some) {
    console.log(some[0]);
});
You can use the sync version of readFile:
function readfile(filename, callback) {
    callback(fs.readFileSync(filename).toString());
}
The sync version of readFile does not let the program continue to the next line until the file has been read; it returns the content of the file directly.
You can also do it in one line with fs.readFileSync(str, 'utf-8') instead of using toString.
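So, as a minimal sketch that keeps the question's callback style, the helper could become:

function readfile(filename, callback) {
    // readFileSync blocks until the file has been read, so matter is
    // fully populated before Fetchdata invokes callback(matter).
    callback && callback(fs.readFileSync(filename, 'utf-8'));
}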
More info
readFileSync API
Synchronous vs Asynchronous code with Node.js
I am building a node application that reads a CSV file from the file system, analyzes the file, and then parses the file using the csv-parse module. Each of these steps are in the form of a stream, piped one into the next.
The trouble I am having is that for some files, the parse step can read the stream, but for others, the read() method returns null on the readable event and I don't know why.
In the code below, specifically, I will sometimes see data come through on calling read() on the parser stream, and other times it will return null. CSV files that succeed always succeed, and CSV files that fail always fail. I tried to determine some difference between the files, but other than using different field names in the first row, and slightly different data in the body, I can't see any significant difference between the source files.
What are some reasons that a node stream's read() function might return null after a readable event?
Sample code:
var parse = require('csv-parse');
var fs = require('fs');
var stream = require('stream');

var byteCounter = new stream.Transform({objectMode : true});
byteCounter.setEncoding('utf8');

var totalBytes = 0;

// count the bytes we have read
byteCounter._transform = function (chunk, encoding, done) {
    var data = chunk.toString();
    if (this._lastLineData) {
        data = this._lastLineData + data;
    }

    var lines = data.split('\n');
    // this is because each chunk will probably not end precisely at the end of a line:
    this._lastLineData = lines.splice(lines.length - 1, 1)[0];

    lines.forEach(function(line) {
        totalBytes += line.length + 1; // we add an extra byte for the end-of-line
        this.push(line);
    }, this);
    done();
};

byteCounter._flush = function (done) {
    if (this._lastLineData) {
        this.push(this._lastLineData);
    }
    this._lastLineData = null;
    done();
};

// csv parser
var parser = parse({
    delimiter: ",",
    comment: "#",
    skip_empty_lines: true,
    auto_parse: true,
    columns: true
});

parser.on('readable', function(){
    var row;
    while (null !== (row = parser.read())) {
        // do stuff
    }
});

// start by reading a file, piping to byteCounter, then pipe to parser.
var myPath = "/path/to/file.csv";
var options = {
    encoding : "utf-8"
};
fs.createReadStream(myPath, options).pipe(byteCounter).pipe(parser);
I was trying to learn streaming in Node.js by writing a small script, but after executing it, the last stream is not pushing all the data.
var stream = require('stream');
var fs = require('fs');
var util = require('util');

function Newliner () {
    stream.Transform.call(this);
}
util.inherits(Newliner, stream.Transform);

Newliner.prototype._transform = function(chunk, enc, done) {
    var split = 0;
    for (var i = 0; i < chunk.length; i++) {
        if (chunk[i] == 10) {
            this.push(chunk.slice(split, i));
            split = i + 1;
        }
    }
}

function Greper(options) {
    stream.Transform.call(this);
    this.regex = new RegExp(options);
}
util.inherits(Greper, stream.Transform);

Greper.prototype._transform = function(chunk, enc, done) {
    this.push(chunk); // Even this is not working.
    /*
    var a = chunk.toString();
    if(this.regex.test(a)){
        this.push(chunk);
    }
    */
}

var n = new Newliner();
var g = new Greper("line");
var f = fs.createReadStream('a.txt');
f.pipe(n).pipe(g).pipe(process.stdout);
The input file a.txt is:
This is line one.
Another line.
Third line.
When executing, only one line is displayed. What is the reason for this?
$ node test.js
This is line one.
Note: When I pipe the file read stream directly to 'g', it works correctly.
You need to call the callback of the _transform() function when you are done processing the chunk. From the documentation:
callback (Function) Call this function (optionally with an error argument) when you are done processing the supplied chunk.
No more data will be pushed into the stream until the callback is called. If you don't call it, then the chunk will not be considered as processed… which is why your program stops after processing only one line.
Just add:
done();
at the end of the Newliner.prototype._transform and Greper.prototype._transform functions.
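For example, the corrected Newliner transform would look like this (Greper._transform gets the same one-line addition):

Newliner.prototype._transform = function(chunk, enc, done) {
    var split = 0;
    for (var i = 0; i < chunk.length; i++) {
        if (chunk[i] == 10) {            // 10 is the byte value of '\n'
            this.push(chunk.slice(split, i));
            split = i + 1;
        }
    }
    done(); // signal that this chunk has been fully processed
};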