Node.js async request can't get body of web page

I want to get the body of each web page from a list of more than 1000 URLs (my goal is to then scrape them with cheerio).
The problem is that I get a weird gzipped result and can't get the content of the body tag. This is the code I'm using (I can't use a plain "request" call because it misses some requests):
var async = require('async');
var fetch = require('isomorphic-unfetch');
const cheerio = require('cheerio');

let urls = // reading a list of ~1000 URLs from JSON file

async.mapLimit(urls, 1, async function(url) {
  const response = await fetch(url);
  return response.body;
}, (err, results) => {
  if (err) throw err;
  console.log(results);
});

Since you're getting a gzip-compressed body, use zlib to decompress it:
var zlib = require('zlib');
var util = require('util');
// promisify gunzip so its result can be awaited and returned from the mapLimit iterator
var gunzip = util.promisify(zlib.gunzip);

async.mapLimit(urls, 1, async function(url) {
  const response = await fetch(url);
  // get the raw (compressed) bytes, then decompress them
  const buffer = Buffer.from(await response.arrayBuffer());
  const dezipped = await gunzip(buffer);
  return dezipped.toString();
}, (err, results) => {
  if (err) throw err;
  console.log(results);
});
Then proceed with your cheerio parsing (see the sketch below) :)
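For completeness, a minimal sketch of that cheerio step, assuming each entry in results is a decompressed HTML string (the selector is just a placeholder):

const cheerio = require('cheerio');

// `results` is the array of HTML strings produced by the mapLimit callback above
results.forEach(html => {
  const $ = cheerio.load(html);
  // grab the text of the <body> tag; swap in whatever selector you actually need
  console.log($('body').text().trim().slice(0, 100));
});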
I hope this helps.

Related

NODE JS - request npm - manipulate url body

I'm working on a server-side (personal) project with Node.js (for the first time), and I ran into some difficulties.
My goal is the following:
First part - I'm using the "/uploads/processData" URL on my server to get URL(s) from the user's request.
Now I want to access the URL(s) from the user's request and get their HTML file(s); to do so I'm using the "request" npm package (code below).
Second part - I want to access the body that I get back from the request package (from the first part), so I'm using the cheerio npm package.
Now to my problem - let's say that I'm trying to get the body of the URL:
https://www.amazon.com/NIKE-Mens-Lunarconverge-Running-Shoes/dp/B06VVFGZHL?pd_rd_wg=6humg&pd_rd_r=61904ea4-c78e-43b6-8b8d-6b5ee8417541&pd_rd_w=Tue7n&ref_=pd_gw_simh&pf_rd_r=VGMA24803GJEV6DY7458&pf_rd_p=a670abbe-a1ba-52d3-b360-3badcefeb448&th=1
For some reason that I can't understand (probably a lack of web development knowledge), the body I get with my first-part code isn't always the same as the one I see when I inspect the above page with F12. Hence my cheerio extraction (the second part) sometimes works as I expect and sometimes does not (because some elements from the full/original HTML are missing). At first I thought it might be a caching issue, so I added a middleware to set "nocache" headers.
What am I missing here? Is the way I'm trying to do this wrong? Is there any way to ensure I get the same full/original HTML every time?
Here is my code so far -
nocache middleware
function nocache(req, res, next) {
  res.header("Cache-Control", "private, no-cache, no-store, must-revalidate");
  res.header("Expires", "-1");
  res.header("Pragma", "no-cache");
  next();
}
EDIT
uploadRoutes.post("/processGoogleSearchData", nocache, (req, res) => {
  // Assuming req.body contains the Google result JSON as "googleSearchResult"
  var itemsArr = [];
  var linksArr = [];
  var bodysArr = [];
  itemsArr = req.body.googleSearchResult.items;
  if (itemsArr.length === 0) {
    // return appropriate message
    return res.status(400).send({ message: "No data sent to server" });
  }
  var linksArr = itemsArr.map(item => item.link);
  // Get the needed info from every link
  linksArr.forEach(link => {
    request(link, (err, response, body) => {
      if (!err && response.statusCode === 200) {
        var $ = cheerio.load(body);
        var tr = $(".a-lineitem").children();
        var priceTd = tr.find(".a-span12");
        var priceSpan = priceTd.find("#priceblock_ourprice");
        console.log(priceSpan.text());
        // when trying to build an array of bodies the extraction doesn't work at all
        bodysArr.push(body);
      }
    });
  });
  res.send(bodysArr);
});
I changed my code to the above, and it seems like the data extraction works more often. Can anyone explain why the extraction still sometimes doesn't work?
I tried returning bodysArr for debugging purposes, but when I do that the extraction doesn't work at all and my route's response is always an empty array. Why is that?
The problem is that:
res.send(bodysArr);
is executed straight after the call to
linksArr.forEach(link => {
The callbacks
(err, response, body) => {
  if (!err && response.statusCode === 200) {
    var $ = cheerio.load(body);
    var tr = $(".a-lineitem").children();
    var priceTd = tr.find(".a-span12");
    var priceSpan = priceTd.find("#priceblock_ourprice");
    console.log(priceSpan.text());
    // when trying to build an array of bodies the extraction doesn't work at all
    bodysArr.push(body);
  }
}
won't be guaranteed to have fired yet. What you want is to ensure that res.send(bodysArr) runs after all the requests have completed.
There are a few ways to handle this; one is with the excellent async library.
Hopefully you can get the gist of it with this example.
var array = [1, 2, 3];

function asyncRequest(input, callback) {
  // Do your fetch request here and call callback when done
  setTimeout(callback, 10); // using setTimeout as an example
}

async.each(array, asyncRequest, (err) => {
  if (err) {
    throw err;
  }
  console.log("All Finished");
});
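To tie that back to the original route, a rough sketch (assuming the same request, linksArr, bodysArr, and res variables from the question) might look like this:

async.each(linksArr, (link, callback) => {
  request(link, (err, response, body) => {
    if (!err && response.statusCode === 200) {
      bodysArr.push(body);
    }
    // always call the callback so async.each knows this link is done
    callback(err);
  });
}, (err) => {
  if (err) {
    return res.status(500).send(err.message);
  }
  // only runs once every request has finished
  res.send(bodysArr);
});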
After reviewing Sudsy's explanation, I came across loops of asynchronous methods.
While playing with this subject, I could not figure out what's wrong with the code below.
This works fine, so I ended up using it:
async function getItemsInfo(itemsArr) {
  return itemsArr.map(async item => {
    try {
      var body = await axios(item.link);
      var $ = await cheerio.load(body.data);
      var tr = await $(".a-lineitem").children();
      var priceTd = await tr.find(".a-span12");
      var priceSpan = await priceTd.find("#priceblock_ourprice");
      return priceSpan.text();
    } catch (err) {
      return err.message;
    }
  });
}
getItemsInfo(linksArr)
.then(res => Promise.all(res))
.then(res => console.log(res))
.catch(err => console.error(err));
Can someone explain to me what's wrong with the following two snippets?
async function getItemsInfo(itemsArr) {
  await Promise.all(
    itemsArr.map(async item => {
      try {
        var body = await axios(item.link);
        var $ = await cheerio.load(body.data);
        var tr = await $(".a-lineitem").children();
        var priceTd = await tr.find(".a-span12");
        var priceSpan = await priceTd.find("#priceblock_ourprice");
        return priceSpan.text();
      } catch (err) {
        throw err.message;
      }
    })
  )
    .then(resulst => {
      return results;
    })
    .catch(err => {
      throw err.message;
    });
}

// the caller function
try {
  getItemsInfo(linksArr).then(results => {
    res.status(200).send(results);
  });
} catch (err) {
  res.status(400).send(err.message);
}
or
async function getItemsInfo(itemsArr) {
  const promises = itemsArr.map(async item => {
    try {
      var body = await axios(item.link);
      var $ = await cheerio.load(body.data);
      var tr = await $(".a-lineitem").children();
      var priceTd = await tr.find(".a-span12");
      var priceSpan = await priceTd.find("#priceblock_ourprice");
      return priceSpan.text();
    } catch (err) {
      return err.message;
    }
  });
  var results = await Promise.all(promises)
    .then(results => {
      return results;
    })
    .catch(err => {
      return err.message;
    });
}

// the caller function
try {
  getItemsInfo(linksArr).then(results => {
    res.status(200).send(results);
  });
} catch (err) {
  res.status(400).send(err.message);
}

NodeJS Download HTML with Request

Having quite a bit of trouble getting an HTML page to download using NodeJS. Here is my code snippet:
const request = require('request');
request('http://www.google.com', { json: true }, (err, res, body) => {
  if (err) {
    return console.log(err);
  }
  console.log(body.url);
  console.log(body.explanation);
});
When I step through this it executes in about half a second. I get no errors back but I'm not getting any content logged to the console...
This works for me.
const request = require('request')
request('https://google.com', (err, res, body) => console.log(err ? err : body))
With request you can pipe the response body of a request directly to a writable stream:
const fs = require('fs')
const request = require('request')
request('https://google.com').pipe(fs.createWriteStream('./google-index.html'))
Per the comments below, the following example illustrates how to wrap this request so it can be awaited and printed to the screen or written to a file.
const { promisify } = require('util')
const fs = require('fs')
const writeFile = promisify(fs.writeFile)
const request = require('request')

const getGoogleIndexHTML = () => {
  return new Promise((resolve, reject) => {
    request('https://google.com', (err, res, body) => err ? reject(err) : resolve(body))
  })
}

const printAndWriteGoogleIndex = async () => {
  try {
    let googleIndexHTML = await getGoogleIndexHTML()
    console.log(googleIndexHTML)
    await writeFile('./google-index.html', googleIndexHTML, 'utf8')
    console.log('google-index.html written.')
  } catch (err) {
    console.log(err)
  }
}

printAndWriteGoogleIndex()

Node - Simple xml API Request

I'm using the latest node.js (and express) to make an API call to a site that returns... XML... Ugh >_>.
I've scoured the web and found a million different ways, but I don't know the latest, most up-to-date/best way to make a request and get a response in Node/Express.
I tried using https://github.com/request/request and did the following:
var sendJsonResponse = function(res, status, content) {
  res.status(status);
  res.json(content);
};

var token = request
  .get('some-website.com/api/stuff')
  .on('response', function(response) {
    console.log(response.statusCode);
    console.log(response.headers['content-type']);
  });

sendJsonResponse(res, 200, token);
In the console.log statements I get 200 and then application/xml;charset=utf-8.
But on my page I don't get the XML I'm looking for. Any ideas? I've tried using https://github.com/Leonidas-from-XIV/node-xml2js to attempt to "parse" the response, in case Node just can't handle the XML response, but to no avail.
var xml2js = require('xml2js');
var parser = new xml2js.Parser(); // create an xml2js parser instance

parser.parseString(response, function(err, result) {
  console.dir(result);
  console.log('Done');
});
Any help on accessing an API using Node and actually using the XML response, please?
EDIT ANSWER
For the Node.js request and xml parsing of the returned xml content:
var request = require('request');
var xml2js = require('xml2js');
var sendJsonResponse = function(res, status, content) {
res.status(status);
res.json(content);
};
/* GET XML Content*/
module.exports.dsRequest = function(req, res) {
var parser = new xml2js.Parser();
request('url_for_xml_request', function(error, response, body) {
parser.parseString(body, function(err, result) {
sendJsonResponse(res, 200, result);
});
});
};
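If it helps to see the parser in isolation, here is a minimal, self-contained sketch (the XML string is just made-up sample data) of what parseString hands back:

var xml2js = require('xml2js');

// made-up sample XML, standing in for the API response
var xml = '<stuff><item id="1">hello</item><item id="2">world</item></stuff>';

var parser = new xml2js.Parser();
parser.parseString(xml, function(err, result) {
  if (err) return console.error(err);
  // result is a plain JS object, roughly { stuff: { item: [ ... ] } }
  console.log(JSON.stringify(result, null, 2));
});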
I think this will work. Because request is asynchronous, you should write it like below:
var sendJsonResponse = function(res, status, content) {
  res.status(status);
  res.json(content);
};

request.get('http://some-website.com/api/stuff', function(err, response, body) {
  sendJsonResponse(res, 200, body);
});

Get File from external URL with fs.readFile

I have links on a page that, when clicked, should open an external docx file. Unfortunately fs.readFile only reads local paths.
I tried
app.get('/getfile', function (req, res) {
  var externalURL = 'http://www.examplesite.com/example.docx';
  // var externalURL = req.query.external;
  fs.readFile(externalURL, function(err, data) {
    var fileData = new Buffer(data).toString('base64');
    res.send(fileData);
  });
});
Try this:
const fs = require("fs");
const http = require("http");

const file = fs.createWriteStream("file.docx");
http.get("http://www.example.com/test.docx", response => {
  response.pipe(file);
});
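A side note: the write stream finishes asynchronously, so if you need to do something only once the file is fully on disk (send a response, log, etc.), you can listen for its 'finish' event:

// fires once the piped response has been fully written and the stream is closed
file.on("finish", () => {
  console.log("file.docx written");
});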
Try node-fetch.
It follows regular client syntax for the fetch command (MDN).
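A rough sketch of how that might look for this case, assuming node-fetch v2 and the same /getfile route and example URL from the question:

const fetch = require('node-fetch');

app.get('/getfile', async function (req, res) {
  try {
    const externalURL = 'http://www.examplesite.com/example.docx';
    const response = await fetch(externalURL);
    // read the whole response into a Buffer, then send it base64-encoded as before
    const data = Buffer.from(await response.arrayBuffer());
    res.send(data.toString('base64'));
  } catch (err) {
    res.status(500).send(err.message);
  }
});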
You can use get from the http package provided by the Node.js standard library.
In my case I created a function that returns a promise that is fulfilled when the file is completely fetched:
// http here is Node's built-in module: const http = require('http');
getFileFromURL(pathFile: string): Promise<Buffer> {
  return new Promise(function(resolve, reject) { // create a promise
    http.get(pathFile.replace('https', 'http'), function(res) {
      let bufferImage = Buffer.from(''); // create an empty buffer
      res.on('data', function(chunk) { // concatenate each chunk as it is received
        bufferImage = Buffer.concat([bufferImage, chunk]);
      });
      res.on('end', function() {
        resolve(bufferImage); // fulfil the promise
      });
      res.on('error', function(err) {
        reject(err); // reject the promise
      });
    });
  });
}
and finally you can use the function like this:
async function() {
  const fileBuffer = await this.getFileFromURL('your external url');
  // here you can do what you want with your **fileBuffer**
  // you can, for example, convert it to a string like this:
  const fileString = fileBuffer.toString('utf-8');
  // or maybe send it in the response like this:
  yourResponse.end(fileBuffer);
  // ....
}
I'd recommend using request to do it.
I don't really know if this is what you mean...
Try:
const request = require('request');

request.get('http://examplesite.com/example.docx', function(err, res, body) {
  // body is the example.docx data.
});

Empty body in a response when using request

I have the following code:
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url');
var fs = require('fs');

fs.readFile("urls.txt", 'utf8', function(err, data) {
  if (err) throw err;
  var urls = data.split('\n');
  urls = urls.filter(function(n) { return n });
  for (var i in urls) {
    request(urls[i], function(err, resp, body) {
      if (err)
        throw err;
      $ = cheerio.load(body, { lowerCaseTags: true, xmlMode: true });
      $('item').each(function() {
        console.log("----------");
        console.log($(this).find('title').text());
        console.log($(this).find('link').text());
        console.log($(this).find('pubDate').text());
      });
    }).end();
  }
});
and from the urls.txt file I only have the following url:
http://www.visir.is/section/?Template=rss&mime=xml
When I use wget on that URL I get a response which looks like an RSS feed, but when I fetch it with the code above the body is empty. Can someone explain why, and how I can fix this?
Update: Simply removing .end() from your original script works. end() terminates the script on callback. IMO, in 2016, I'd definitely choose Request over Needle.
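For reference, here is the relevant part of the original script with only that change (everything else unchanged):

for (var i in urls) {
  // same as before, just without the trailing .end()
  request(urls[i], function(err, resp, body) {
    if (err) throw err;
    var $ = cheerio.load(body, { lowerCaseTags: true, xmlMode: true });
    $('item').each(function() {
      console.log("----------");
      console.log($(this).find('title').text());
      console.log($(this).find('link').text());
      console.log($(this).find('pubDate').text());
    });
  });
}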
Request is an odd bird, and as for why it's not working in your case, it gives no information in the response at all.
Try with Needle instead:
var needle = require('needle');
var cheerio = require('cheerio');
var URL = require('url');
var fs = require('fs');

fs.readFile("urls.txt", 'utf8', function(err, data) {
  if (err) throw err;
  var urls = data.split('\n');
  urls = urls.filter(function(n) { return n });
  for (var i in urls) {
    needle.get(urls[i], function(err, resp, body) {
      if (err)
        throw err;
      $ = cheerio.load(body, { lowerCaseTags: true, xmlMode: true });
      $('item').each(function() {
        console.log("----------");
        console.log($(this).find('title').text());
        console.log($(this).find('link').text());
        console.log($(this).find('pubDate').text());
      });
    });
  }
});
