I have the following code:
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url');
var fs = require('fs');

fs.readFile("urls.txt", 'utf8', function(err, data) {
    if (err) throw err;
    var urls = data.split('\n');
    urls = urls.filter(function(n){ return n });
    for (var i in urls) {
        request(urls[i], function(err, resp, body) {
            if (err) throw err;
            var $ = cheerio.load(body, {lowerCaseTags: true, xmlMode: true});
            $('item').each(function() {
                console.log("----------");
                console.log($(this).find('title').text());
                console.log($(this).find('link').text());
                console.log($(this).find('pubDate').text());
            });
        }).end();
    }
});
and in the urls.txt file I have only the following URL:
http://www.visir.is/section/?Template=rss&mime=xml
When I use wget on that URL I get a response that looks like an RSS feed, but when I fetch it with the code above the body is empty. Can someone explain why, and how I can fix this?
Update: Simply removing .end() from your original script works; calling .end() ends the request prematurely, so the callback never sees the body. IMO, in 2016, I'd definitely choose Request over Needle.
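That is, the loop body becomes a plain request call (a minimal sketch; the parsing is unchanged from the question):

request(urls[i], function(err, resp, body) {
    if (err) throw err;
    var $ = cheerio.load(body, {lowerCaseTags: true, xmlMode: true});
    // ... same $('item').each(...) logic as above
}); // note: no .end()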
Request is an odd bird, and in your case it gives no information at all in the response to explain why it's not working.
Try with Needle instead:
var needle = require('needle');
var cheerio = require('cheerio');
var URL = require('url');
var fs = require('fs');

fs.readFile("urls.txt", 'utf8', function(err, data) {
    if (err) throw err;
    var urls = data.split('\n');
    urls = urls.filter(function(n){ return n });
    for (var i in urls) {
        needle.get(urls[i], function(err, resp, body) {
            if (err) throw err;
            var $ = cheerio.load(body, {lowerCaseTags: true, xmlMode: true});
            $('item').each(function() {
                console.log("----------");
                console.log($(this).find('title').text());
                console.log($(this).find('link').text());
                console.log($(this).find('pubDate').text());
            });
        });
    }
});
I am fetching JSON data from the Instagram API, which returns something like {"pagination": {}, "data": [{"id": ...... and I am using node.js to fetch it. What's wrong with my code? I cannot see the expected console log of 'success'!
var cheerio = require('cheerio'),
    request = require('request'),
    url = require('url');

var results = [];
var target = 'https://api.instagram.com/v1/users/self/media/recent/?access_token=';

request.get(target, function(error, response, body) {
    var $ = cheerio.load(body);
    $('data').each(function(i, element) {
        console.log('success');
        results.push(element);
    });
    console.log(results);
});
Try this:
request.get(target, function(error, response, body) {
    console.log(body);
    var yourObj = JSON.parse(body);
    // you can get pagination, etc. with yourObj.pagination, ...
});
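The underlying issue is that the body is JSON, not HTML, so cheerio has nothing to select: $('data') looks for a <data> element, which never exists in a JSON string. Once the body is parsed, data is a plain array you can iterate directly. A minimal sketch, reusing the target and results variables from the question:

request.get(target, function(error, response, body) {
    if (error) throw error;
    var yourObj = JSON.parse(body);
    // 'data' is an array in the parsed object, not a DOM element
    yourObj.data.forEach(function(element) {
        console.log('success');
        results.push(element);
    });
    console.log(results);
});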
I want to get the body of a web page from a list of more than 1000 URLs (my goal is then to do scraping using cheerio).
The problem is that I get a weird GUNZIP result and I can't get the content of the body tag. This is the code that I'm using (I can't use a simple "request" because it misses some requests):
var async = require('async');
var fetch = require('isomorphic-unfetch');
const cheerio = require('cheerio');

let urls = // reading a list of ~1000 URLs from JSON file

async.mapLimit(urls, 1, async function(url) {
    const response = await fetch(url);
    return response.body;
}, (err, results) => {
    if (err) throw err;
    console.log(results);
});
The weird GUNZIP result means the body you get back is still gzip-compressed. Use zlib to decompress it:
var zlib = require('zlib');

async.mapLimit(urls, 1, async function(url) {
    const response = await fetch(url);
    // gunzip needs a Buffer, and its result must be awaited so the
    // worker actually returns the unzipped string (the plain callback
    // version would make every result undefined)
    const buffer = Buffer.from(await response.arrayBuffer());
    const dezipped = await new Promise(function(resolve, reject) {
        zlib.gunzip(buffer, function(err, result) {
            err ? reject(err) : resolve(result);
        });
    });
    return dezipped.toString();
}, (err, results) => {
    if (err) throw err;
    console.log(results);
});
Then proceed with your parsing with cheerio :)
I hope this helps.
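From there, a minimal sketch of the cheerio step, assuming the goal is the text of the body tag as stated in the question (this would replace the return dezipped.toString() line in the worker above):

const $ = cheerio.load(dezipped.toString());
return $('body').text();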
I have some sample code that looks like the following:
var soap = require('soap');
var url = "http://www.example.com?wsdl";
var soapHeader = "<AuthHeader>...</AuthHeader>";

soap.createClient(url, function(err, client){
    client.addSoapHeader(soapHeader);
    var args = {
        'arg1': 1
    };
    client.SomeMethod(args, function(err, result){
        if (err) {
            throw err;
        }
        console.log(JSON.stringify(result));
    });
});
The problem is that the request is failing, due to either an incorrect header or the arguments I'm passing. It would be much easier to debug if I could see the entirety of the request body. How is that done?
Not sure if this is still relevant to you, but here goes:
var soap = require('soap');
var url = "http://www.example.com?wsdl";
var soapHeader = "<AuthHeader>...</AuthHeader>";

soap.createClient(url, function(err, client){
    client.addSoapHeader(soapHeader);
    var args = {
        'arg1': 1
    };
    client.SomeMethod(args, function(err, result){
        if (err) {
            throw err;
        }
        console.log('last request: ', client.lastRequest);
        console.log(JSON.stringify(result));
    });
});
The extra console.log statement with "lastRequest" will show the XML request that is being sent, which can be used for debugging. Hope this helps.
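Since the question says the request itself is failing, it may also help to log client.lastRequest in the error branch before re-throwing, so the outgoing XML is visible even when the call errors (a small variation on the snippet above):

client.SomeMethod(args, function(err, result){
    if (err) {
        // the outgoing request XML is still available here
        console.log('last request: ', client.lastRequest);
        throw err;
    }
    console.log(JSON.stringify(result));
});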
I have an array of image URLs. I need to download these images asynchronously from their remote URLs to my server.
EXAMPLE:
// I need this function to return an array of local filenames after the download
function someFunctionAsDownloadedRemoteIMages(arrayOfRemoteUrls){
    localImages = [];
    arrayOfRemoteUrls.forEach(function(url){
        request.head(url, function(err, res, body){
            newImageName = randomInt(100000, 999999);
            var filename = 'catalog/import/'+newImageName+'.jpg';
            request(url, {encoding: 'binary'}, function(error, response, body) {
                fs.writeFile('image/'+filename, body, 'binary', function (err) {
                    if (err)
                        return;
                    localImages.push(filename);
                });
            });
        });
    });
}
var remoteImagesArray = ["http://example.com/1.jpg", "http://example.com/1444.jpg","http://example.com/ddsgggg.jpg"];
localImagesArray = someFunctionAsDownloadedRemoteIMages(remoteImagesArray);
someFunctionProccess(localImagesArray);
If you want it to asynchronously return anything, you must use a callback pattern instead of returning a value from the function. With that said, you also need a way for the final result callback to fire once all images have been loaded. I would suggest using a module like async and the map function it provides. The map function processes an array and gives back an array of results. Below is an example:
var async = require('async');
var fs = require('fs');
var request = require('request');

// randomInt is assumed to be defined elsewhere, as in the question
function processUrl(url, callback){
    request.head(url, function(err, res, body){
        var newImageName = randomInt(100000, 999999);
        var filename = 'catalog/import/'+newImageName+'.jpg';
        request(url, {encoding: 'binary'}, function(error, response, body) {
            fs.writeFile('image/'+filename, body, 'binary', function (err) {
                if (err) return callback(err);
                callback(null, filename);
            });
        });
    });
}

function someFunctionAsDownloadedRemoteIMages(arrayOfRemoteUrls, callback){
    async.map(arrayOfRemoteUrls, processUrl, callback);
}
var remoteImagesArray = ["http://example.com/1.jpg", "http://example.com/1444.jpg", "http://example.com/ddsgggg.jpg"];

someFunctionAsDownloadedRemoteIMages(remoteImagesArray, function(err, localImagesArray){
    if (err) {
        // handle it
        return;
    }
    someFunctionProccess(localImagesArray);
});
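One design note: async.map starts all downloads in parallel and delivers the results in the same order as the input array. If firing many parallel requests is a concern, async.mapLimit is used the same way but takes an extra concurrency argument between the array and the iterator function.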
I'm using expressjs.
I have a router:
exports.index = function(req, res){
    if (req.param('name')) {
        var simpleParser = require('../tools/simpleParser');
        var result = simpleParser.images(req.param('name'));
        // how can I get the result from simpleParser.images after it completes?
        res.json(result);
    }
    res.render('goods');
};
And I have a simpleParser.images:
module.exports = {
    images: function (url) {
        if (url) {
            var request = require('request'),
                cheerio = require('cheerio');
            request({
                uri: url,
                method: 'GET',
                encoding: 'binary'
            }, function (err, res, body) {
                var tmp = [];
                body = new Buffer(body, 'binary');
                var $ = cheerio.load(body);
                $('.products-listing li a').each(function () {
                    var link = $(this).find('img').attr('src');
                    tmp.push(link);
                });
                // How can I send tmp to the router when it completes?
            });
        }
    }
};
When I request the page with ?name it returns null, because the request in simpleParser.images works asynchronously. How can I subscribe to the result of simpleParser's request function and send the JSON after it completes?
Like many node modules do, you can accept a callback in your own utility functions. Your simpleParser.images function is not synchronous, as it uses the request module. You can have it accept a callback that will be called upon the completion of the network request and some data parsing.
var request = require('request'),
    cheerio = require('cheerio');

module.exports = {
    images: function (url, callback) {
        if (!url) return callback(null, null);
        request({
            uri: url,
            method: 'GET',
            encoding: 'binary'
        }, function (err, res, body) {
            if (err) return callback(err);
            var tmp = [];
            body = new Buffer(body, 'binary'); // Buffer.from(body, 'binary') on newer Node versions
            var $ = cheerio.load(body);
            $('.products-listing li a').each(function () {
                var link = $(this).find('img').attr('src');
                tmp.push(link);
            });
            // Here we have the data and can pass it to the callback
            callback(null, tmp);
        });
    }
};
Then you essentially have your own function that can be performed asynchronously. Your express route is async-friendly as well, so just plug in your new function:
if (req.param('name')) {
    simpleParser.images(req.param('name'), function (err, images) {
        // err from the parser can be handled here as well
        res.json(images);
    });
} else {
    res.render('goods');
}
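As an aside: newer versions of Express deprecate req.param(), so for a query-string parameter like ?name you would read req.query.name instead; the callback pattern shown here stays the same.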