How to create a global array in Node.js? - node.js

Do I need to store each link in an array inside the "request({..})" callback and then display it or work with it outside of "request({..})"? This is my code, but it does not work. Any ideas?
var request = require("request");
var cheerio = require("cheerio");
// Collects every href found on the page; filled in by the request callback below.
var arrayLinks = [];
// Fetch the page; the callback receives (error, response, body).
request({
uri: "http://www.some-url.com",
}, function(error, response, body) {
// NOTE(review): `error` is never checked before `body` is parsed.
var $ = cheerio.load(body);
// Push the href attribute of each <a> element onto arrayLinks.
$("a").each(function() {
var link = $(this);
arrayLinks.push(link.attr("href"));
});
});
// NOTE(review): this statement executes right after request() is called; presumably
// the callback above has not fired yet at that point, so arrayLinks is still
// empty here and nothing is printed — confirm.
arrayLinks.forEach(function(link){console.log(link)});

For example:
var request = require("request");
var cheerio = require("cheerio");
var arrayLinks = [];
// Fetch the page, collect every link into arrayLinks, and only then hand
// over to linkTheArray(): the array is populated inside this callback, so
// any code that reads it must be started from here.
request({
  uri: "http://www.some-url.com",
}, function(error, response, body) {
  if (error) {
    // Without a response there is no body to parse.
    console.error(error);
    return;
  }
  var $ = cheerio.load(body);
  $("a").each(function() {
    arrayLinks.push($(this).attr("href"));
  });
  linkTheArray();
});

/**
 * Prints every collected link, one per line.
 * Must be called after the request callback has filled arrayLinks.
 */
function linkTheArray() {
  arrayLinks.forEach(function(link) {
    console.log(link);
  });
}
Now it runs only after the request is done, because it is called from inside the request callback. There is one other way, but it is pretty ugly: you can poll with a timeout function until some data appears in the array.

Related

Cheerio returns undefined when using the "contains" selector

I am currently trying to parse some HTML from this URL:
The main information I am after is the listed Weight. Using the Console in Chrome, I can issue the command:
$("th:contains(Weight)").parent()[0];
And it will give me the table rows containing all the information I need about the weight.
I tried to use this in Cheerio, but it just returns undefined.
This is my Node.js code:
var needle = require('needle');
var cheerio = require('cheerio');
/**
 * Requests the rei.com product page for `product` (following up to five
 * redirects), looks up the parent of the "Weight" table header and logs it.
 *
 * @param {number|string} product - Product id appended to the product URL.
 * @returns {Promise<undefined>} Settles after the lookup (or the error) has been logged.
 */
function rei(product) {
  const productUrl = "https://rei.com/product/" + product;
  const requestOptions = { follow_max: 5 };

  const logWeightRow = (response) => {
    const $ = cheerio.load(response.body);
    const weightRow = $("th:contains(Weight)").parent()[0];
    console.log(weightRow);
  };

  const logFailure = (error) => {
    console.log(error);
  };

  return needle("get", productUrl, requestOptions)
    .then(logWeightRow)
    .catch(logFailure);
}
rei(893905);
What would be the best way to get the information I need from Rei's website in an automated manner?
Try this:
var needle = require('needle');
var cheerio = require('cheerio');
var fs = require('fs');
/**
 * Requests the rei.com product page for `product` (following up to five
 * redirects) and reads the product details that the page embeds as JSON in
 * a <script data-client-store="product-details"> element.
 *
 * Every spec named "Weight" has its `values` logged and collected.
 *
 * @param {number|string} product - Product id appended to the product URL.
 * @returns {Promise<Array|undefined>} Resolves with the `values` of each
 *   "Weight" spec, or with undefined after a failure has been logged.
 */
function rei(product) {
  // Request page from rei.com and follow the redirect
  return needle("get", "https://rei.com/product/" + product, {
    follow_max: 5
  }).then(function(response) {
    const $ = cheerio.load(response.body);
    // your data in script
    const raw = $('script[data-client-store="product-details"]').html();
    if (!raw) {
      // Fail with a descriptive message instead of a TypeError on `null.specs`.
      throw new Error('product-details script not found for product ' + product);
    }
    const content = JSON.parse(raw);
    const specs = Array.isArray(content && content.specs) ? content.specs : [];
    const weights = [];
    for (const spec of specs) {
      if (spec.name === 'Weight') {
        console.log(spec.values);
        weights.push(spec.values);
      }
    }
    return weights;
  }).catch(function(error) {
    console.log(error);
  });
}
rei(893905);

WebScraping & web navigation simulation

I'm making a web scraper, and I already know how to scrape some data and convert it to JSON with this code I wrote:
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var url = 'http://www.footmercato.net/';
// Fetch the page, build one record per `.text` element, and write the
// records to output.json as pretty-printed JSON.
request(url, function(err, resp, body) {
  if (err) {
    console.error('Request failed:', err);
    return;
  }
  var $ = cheerio.load(body);
  var data = [];
  $('.text').each(function(i, element) {
    data.push({
      title: "",
      // Read only the paragraphs inside this element; a bare $('p') would
      // put the text of every paragraph on the page into each record.
      article: $(element).find('p').text(),
      date: ""
    });
  });
  // Serialize once: stringifying an already-stringified value would write a
  // single quoted string to the file instead of a JSON array.
  fs.writeFile('output.json', JSON.stringify(data, null, 4), function(writeErr) {
    if (writeErr) {
      console.error('Could not write output.json:', writeErr);
      return;
    }
    console.log('File successfully written!');
  });
});
app.listen('8080');
But I would like to navigate the website I'm scraping, fill out forms, and go to other pages.
Does anybody know if I can do this with cheerio, or how I can add it to my existing code?
Thanks
You can use webdriverio: it opens a browser window, and then you can manipulate the DOM through the webdriverio API to handle forms and mouse clicks, and navigate from one page to another.
var webdriverio = require('webdriverio');
// Capabilities handed to webdriverio.remote(); requests a Firefox session.
var options = { desiredCapabilities: { browserName: 'firefox' } };

// Prints the page title resolved by getTitle().
function logTitle(title) {
  console.log('Title was: ' + title);
}

// Start a session, open the page, report its title, then end the session.
var client = webdriverio.remote(options);
client
  .init()
  .url('http://www.google.com')
  .getTitle()
  .then(logTitle)
  .end();

I cannot get url from a website using nodejs

I hope to scrape urls from this website by using the code below:
var request = require("request");
cheerio = require("cheerio");
urls = [];
// Fetch the sports topic page and collect article URLs into `urls`.
request("http://news.sabay.com.kh/topics/sport", function(err, resp, body){
// Only parse the body when there is no error and the status code is 200.
if(!err && resp.statusCode ==200){
var $ = cheerio.load(body);
// NOTE(review): the second argument is presumably treated as a search context
// rather than as part of the selector (cf. ".article h4.title > a") — confirm
// against the cheerio documentation.
$(".article","h4.title").each(function(){
// NOTE(review): `this` looks like a raw element here, not a cheerio wrapper, so
// `.attr` is presumably unavailable on it (cf. `this.attribs.href`) — confirm.
var url = this.attr("href");
urls.push(url);
});
// Printed once, after the loop over the matched elements has finished.
console.log(urls);
}
});
but I cannot get the result. When I run I got this
$ node server.js
[]
First, use a proper CSS selector :
.article h4.title > a
Then, use the proper field :
var url = this.attribs.href
Which gives :
// Each name gets its own declaration: with a `;` after the first require,
// bare `cheerio = ...` / `urls = ...` would be assignments to undeclared
// globals, which throw in strict mode and ES modules.
var request = require("request");
var cheerio = require("cheerio");
var urls = [];

// Fetch the sports topic page and print the href of every article title link.
request("http://news.sabay.com.kh/topics/sport", function(err, resp, body) {
  if (err || resp.statusCode !== 200) {
    // Report the failure instead of silently printing nothing.
    console.error("Request failed:", err || resp.statusCode);
    return;
  }
  var $ = cheerio.load(body);
  $(".article h4.title > a").each(function() {
    // `this` is the matched element; its attributes live under `attribs`.
    urls.push(this.attribs.href);
  });
  console.log(urls);
});
and outputs :
[ 'http://news.sabay.com.kh/article/546826',
'http://news.sabay.com.kh/article/546763',
'http://news.sabay.com.kh/article/546520',
'http://news.sabay.com.kh/article/546568',
'http://news.sabay.com.kh/article/546460',
'http://news.sabay.com.kh/article/546448',
'http://news.sabay.com.kh/article/545674',
'http://news.sabay.com.kh/article/546235',
'http://news.sabay.com.kh/article/545698',
'http://news.sabay.com.kh/article/546091' ]

How to concurrent download files using cheerio and nodejs?

I have a website with multiple pages; each page lists download links which I want to scrape and download.
I have a few issues with it:
My script only downloads about 4-5 files and then gets stuck.
I would like to download as many files concurrently as my CPU can handle.
I ran into the maximum event listeners limit; I don't understand why that happens, so I just raised the limit.
How do I follow redirects using only the request module (without follow-redirects)?
How do I download a file the way the browser does, without specifying its name? There is no Content-Disposition header, but I think the browser follows the redirects and the redirected URL has the filename in its path.
My current code looks like so:
var request = require('request');
var cheerio = require('cheerio');
var https = require('follow-redirects').https;
require('events').EventEmitter.prototype._maxListeners = 1000;
// Queue one listing-page request per page number; gotHTML handles each response.
for (var i = 1; i <= 10000; i++) {
  // Declared with `var` so the URL is not written to an implicit global.
  // No wrapper function is needed: the URL is consumed synchronously by
  // request() within the same iteration.
  var pageUrl = 'http://mywebsite.com/files?page=' + i;
  request(pageUrl, gotHTML);
}
/**
 * Request callback for one listing page: finds the link inside every
 * `.file-header` element and starts a download for each of them.
 *
 * @param {Error|null} err - Request error, if any.
 * @param {object} resp - Response object (unused).
 * @param {string} html - Page markup to parse.
 */
function gotHTML(err, resp, html) {
  if (err || !html) {
    // A failed page request must not crash the whole run.
    console.error('Failed to fetch page:', err);
    return;
  }
  var $ = cheerio.load(html);
  $('.file-header').each(function() {
    var fileLink = $(this).children().first().children().first().attr('href');
    if (!fileLink) {
      // Header without a link: skip it rather than throw on `.substring`.
      return;
    }
    // Drop the first 10 characters of the href; the rest is the file name.
    var fileName = fileLink.substring(10);
    var downloadLink = 'https://mywebsite.com/api/download/' + fileName;
    download(downloadLink, function() {
      console.log('downloaded');
    });
  });
}
// Requests `url`, pipes the response body into a local file named after the
// last path segment of the Location header, and calls `cb` once the file
// has been closed.
function download(url, cb) {
// `https` is the follow-redirects wrapper required above. The local `request`
// variable shadows the `request` module inside this function.
var request = https.get(url, function(response) {
// NOTE(review): reads the Location header through `request.res` instead of the
// `response` argument; presumably both refer to the same message — confirm.
var location = request.res.headers.location;
console.log(location);
// Keep only the last path segment; it is used as the local file name.
location = location.split('/').pop();
console.log(location);
// NOTE(review): `fs` is not required anywhere in the visible code — confirm it is in scope.
var file = fs.createWriteStream(location);
response.pipe(file);
// Close the file once everything has been written, then notify the caller.
file.on('finish', function() {
file.close(cb);
});
});
}
The default HTTP/HTTPS Agent only uses a maximum of 5 sockets (maxSockets) for requests to the same origin (this was the default in Node.js v0.10 and earlier; from v0.12 on, the default is Infinity). So this could be causing some issues for you.
Try changing this:
var request = https.get(url, function(response) {
to this:
var options = require('url').parse(url);
options.agent = false; // or use a custom https.Agent with a higher `maxSockets`
var request = https.get(options, function(response) {

How to set the content length with node.js request module

Here is the code that tries to submit a multipart/form-data according to documentation:
var request = require('request');
// Logs the response body once the POST has completed.
function printBody(err, resp, body) {
  console.log(body);
}

// Start the POST first, then attach a multipart form to the pending request
// and add a single field to it.
var req = request.post('http://echo.httpkit.com', printBody);
var form = req.form();
form.append('name', 'value');
The response is:
411 Length Required
I would recommend using the form-data library: https://github.com/felixge/node-form-data
npm install form-data
Then set your code to something like so:
var FormData = require('form-data');
var request = require('request');
// Logs the error (if any) and the response passed to the submit callback.
function logSubmitResult(err, res) {
  console.log(err, res);
}

// Build the multipart form on its own and let form-data perform the POST.
var form = new FormData();
form.append('name', 'value');
form.submit('http://echo.httpkit.com', logSubmitResult);
// Create a request wrapper that sends an explicit Content-Length header.
// The options must be passed as an object literal: { headers: { ... } }.
var r = request.defaults({ headers: { 'Content-Length': contentlen } });
var post = r.post(url, function(err, response) {});
// Attach a multipart form to the pending POST and add a field and a file stream.
var form = post.form();
form.append('param1', param1);
form.append('data', fs.createReadStream(pathtofile));
Here, contentlen is an integer.

Resources