I'm trying to do some scraping with nightmare and my work is almost functional. The problem is that I'm facing an issue when I try to perform a click() after the evaluate() and run() have been called. After I run those two functions I'm trying to perform another click to move myself to another part of the website, but it is not executing the click().
At this point I'm not sure what the issue is. I have a few assumptions: maybe those functions are asynchronous and I'm trying to click() when the callbacks aren't ready yet, or one of those functions ends the current nightmare object and I don't have the scope anymore.
// Dependencies for the scraper.
const Nightmare = require('nightmare');
//var nightmare = Nightmare({show:true})
const express = require('express');
const fs = require('fs');
const request = require('request');
const cheerio = require('cheerio');
const app = express();

// Target page and the CSS selectors used to drive it.
const urlWeb = "someurl";
const selectCity = "#ddl_city";
const selectTheater = "#ddl_theater";
const enterBtn = "#btn_enter";
const mainSelector = "#aspnetForm";
let flagReady = true;
// Keep a reference to the instance so more actions can be queued after the
// scrape callback runs.
var scraper = new Nightmare({show:true});

scraper
.goto(urlWeb)
.wait(selectCity)
.select(selectCity, '19')
.wait(8000)
.select(selectTheater, '12')
.wait(1000)
.click(enterBtn)
.wait(mainSelector)
.evaluate(function(){
//returning HTML for cheerio
return document.body.innerHTML;
})
.then(function(body){
// BUG FIX: the original used .run(function(err, nightmare){...}) and loaded
// the nightmare instance — not the evaluated HTML — into cheerio; .then()
// receives the value returned by .evaluate().
// Loading HTML body on jquery cheerio
var $ = cheerio.load(body);
//Looping on each div for seccion de Carterla para Hoy
$('.showtimeDaily').each(function(index, element){
//spanish title
console.log($(this).find('h3').children().text());
//english title
console.log($(this).find('h4').text());
//schedule for today
console.log($(this).find('li').children().text() + " ");
//img for movie
console.log($(this).find('img').attr('src'));
//show time data such as gender, length, language
console.log($(this).find('.showtimeData').text());
var showtimeData = $(this).find('.showtimeData').text();
//console.log(JSON.stringify(showtimeData.replace(/\t|\n/g, "")));
})
console.log('Done!');
// BUG FIX: .click() was chained AFTER .run(); .run() executes and terminates
// the queued actions, so that click never ran. Queue it here, inside the
// callback, and return the promise so errors propagate down the chain.
return scraper.click('a[href="../showtimes/weekly.aspx"]');
})
.catch(function(err){
console.log(err);
});
I was having issues with the asynchronous callbacks, so what I did was nest calls on the nightmare object to make sure that the tasks were running one after the other. This is the code:
nightmare
.goto(urlWeb)
.wait(selectCity)
.select(selectCity, '19')
.wait(8000)
.select(selectTheater, '12')
.wait(1000)
.click(enterBtn)
.wait(mainSelector)
.evaluate(function(){
//returning HTML for cheerio
return document.body.innerHTML;
})
.then(function(body){
// Loading HTML body on jquery cheerio
var $ = cheerio.load(body);
//Looping on each div for seccion de Carterla para Hoy
$('.showtimeDaily').each(function(index, element){
//spanish title
console.log($(this).find('h3').children().text());
//english title
console.log($(this).find('h4').text());
//schedule for today
console.log($(this).find('li').children().text() + " ");
//img for movie
console.log($(this).find('img').attr('src'));
//show time data such as gender, length, language
console.log($(this).find('.showtimeData').text());
var showtimeData = $(this).find('.showtimeData').text();
//console.log(JSON.stringify(showtimeData.replace(/\t|\n/g, "")));
})
// BUG FIX: the nested nightmare chain was a floating promise — it was never
// returned, so console.log('Done!') ran before the navigation finished and
// any rejection was silently lost. Return it so the outer chain waits.
return nightmare
.goto('')
.wait('body')
.title();
})
.then(function(title){
console.log(title);
console.log('Done!');
})
.catch(function(err){
console.log(err);
});
Related
I have a large XML file which is a combination of XML documents. I'm trying to use a Node.js XML splitter and respond back with the number of documents I have found.
My code looks something like this. I'm looking to get the number of documents outside the function(in the last line). Is there something I can do to achieve this?
var XmlSplit = require('./xmlsplitter.js')
const fs = require('fs')
var xmlsplit = new XmlSplit()
var no_of_docs = 0
var inputStream = fs.createReadStream('./files/input/test.xml')

// BUG FIX: the original called inputStream.pipe(xmlsplit) twice; pipe once
// and attach both listeners to the same resulting stream.
var docStream = inputStream.pipe(xmlsplit)
docStream.on('data', function(data) {
  // One 'data' event per split-out XML document.
  var xmlDocument = data.toString();
  no_of_docs = no_of_docs + 1;
})
docStream.on('end', function() {
  console.log('Stream ended');
  // The count is only valid here, once the stream has finished reading.
  console.log(no_of_docs);
});
// NOTE: this line runs synchronously, BEFORE the stream has been read, so it
// always logs 0 — move anything that needs the final count into the 'end'
// handler above (or wrap the stream in a Promise and await it).
console.log("This is outside the function " + no_of_docs);
I'm attempting to write a very basic scraper that loops through a few pages and outputs all the data from each url to a single json file. The url structure goes as follows:
http://url/1
http://url/2
http://url/n
Each of the urls has a table, which contains information pertaining to the ID of the url. This is the data I am attempting to retrieve and store inside a json file.
I am still extremely new to this and having a difficult time moving forward. So far, my code looks as follows:
// Scrape pages 1163-1165 and write each one to its own JSON file.
// NOTE(review): this block has several defects, annotated below but left
// as-is here:
//  - request() is asynchronous, so res.send(json) at the bottom runs before
//    any callback has fired (the outer json is still undefined).
//  - `var i` is shared by all callbacks, so by the time fs.writeFile runs,
//    i has its final value (classic closure-in-loop bug).
//  - .filter() callbacks should return a boolean; .each() is the intended
//    iteration helper.
app.get('/scrape', function(req, res){
// Outer json — never assigned; shadowed by the inner `var json` below.
var json;
for (var i = 1163; i < 1166; i++){
// NOTE(review): assigned without var/let — an implicit global, overwritten
// on every iteration.
url = 'https://urlgoeshere.com' + i;
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var mN, mL, iD;
// Local result object for this response (shadows the outer json).
var json = { mN : "", mL : "", iD: ""};
// Misuse of .filter(): the callback mutates state instead of returning
// a boolean; each matching element overwrites the same fields.
$('html body div#wrap h2').filter(function(){
var data = $(this);
mN = data.text();
json.mN = mN;
})
$('table.vertical-table:nth-child(7)').filter(function(){
var data = $(this);
mL = data.text();
json.mL = mL;
})
$('table.vertical-table:nth-child(8)').filter(function(){
var data = $(this);
iD = data.text();
json.iD = iD;
})
}
// NOTE(review): `i` here is the loop variable captured by reference, so
// every callback writes to 'output' + (final i) + '.json'.
fs.writeFile('output' + i + '.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
})
});
}
// Runs before the request callbacks above — json is still undefined here.
res.send(json);
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
When I run the code as displayed above, the output within the output.json file only contains data for the last url. I presume that's because I attempt to save all the data within the same variable?
If I include res.send() inside the loop, so the data writes after each page, I receive the error that multiple headers cannot be sent.
Can someone provide some pointers as to what I'm doing wrong? Thanks in advance.
Ideal output I would like to see:
Page ID: 1
Page Name: First Page
Color: Blue
Page ID: 2
Page Name: Second Page
Color: Red
Page ID: n
Page Name: Nth Page
Color: Green
I can see a number of problems:
Your loop doesn't wait for the asynchronous operations in the loop, thus you do some things like res.send() before the asynchronous operations in the loop have completed.
Inappropriate use of cheerio's .filter().
Your json variable is constantly being overwritten so it only has the last data in it.
Your loop variable i would lose its value by the time you tried to use it in the fs.writeFile() statement.
Here's one way to deal with those issues:
const rp = require('request-promise');
const fsp = require('fs').promises;

// GET /scrape — fetch pages 1163-1165 sequentially, persist one JSON file
// per page, and respond with the accumulated array of scraped records.
app.get('/scrape', async (req, res) => {
    const results = [];
    for (let page = 1163; page < 1166; page++) {
        const url = `https://urlgoeshere.com/${page}`;
        try {
            // await serializes the requests so the loop pauses per page.
            const html = await rp(url);
            const $ = cheerio.load(html);
            // One record per page; .first() picks the first matching node.
            const record = {
                iD: $('table.vertical-table:nth-child(8)').first().text(),
                mN: $('html body div#wrap h2').first().text(),
                mL: $('table.vertical-table:nth-child(7)').first().text(),
            };
            // Accumulate for the final response.
            results.push(record);
            // Persist this page's record on its own.
            await fsp.writeFile(`output${page}.json`, JSON.stringify(record, null, 4));
            console.log('File successfully written! - Check your project directory for the output' + page + '.json file');
        } catch (e) {
            // Abort on the first failure with a 500.
            console.log("Error scraping ", url, e);
            res.sendStatus(500);
            return;
        }
    }
    // All pages scraped — send the array as the final result.
    res.send(results);
});
Things different in this code:
Switch over all variable declarations to let or const
Declare route handler as async so we can use await inside.
Use the request-promise module instead of request. It has the same features, but returns a promise instead of using a plain callback.
Use the promise-based fs module (in latest versions of node.js).
Use await in order to serialize our two asynchronous (now promise-returning) operations so the for loop will pause for them and we can have proper sequencing.
Catch errors and stop further processing and return an error status.
Accumulate an object of data for each iteration of the for loop into an array.
Change .filter() to .first().
Make the response to the request handler be a JSON array of data.
FYI, you can tweak the organization of the data in obj however you want, but the point here is that you end up with an array of objects, one for each iteration of the for loop.
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.
I have two questions.
I am trying to get the value of propval, but when I run the scraper I just get an empty string back. Should I use a different method
besides filter or am I not choosing the right element? I've tried
different elements, but it does the same thing.
Can I use a loop to go through every single table row to collect all the prop value ids or is there a more efficient way?
const express = require('express');
const fs = require('fs');
const request = require('request');
const cheerio = require('cheerio');
var app = express();

// GET /scrape — fetch the ESPN streak page, extract the game question and
// the matchup prop value, and dump both to output.json.
app.get('/scrape', function(req, res) {
// NOTE(review): assigned without var/let — an implicit global.
url = 'http://streak.espn.com/en/';
request(url, function(error, response, html){
if(!error) {
var $ = cheerio.load(html);
var gameQuestion, propVal;
// Single shared result object: each .each() pass below overwrites the same
// field, so only the last matching element's value survives.
var json = { gameQuestion : "", propVal : ""};
$('.gamequestion').each(function(){
var data = $(this)
gameQuestion = data.text();
json.gameQuestion = gameQuestion;
})
// NOTE(review): 'a#.matchupDiv' is not a valid selector (no id between '#'
// and '.'), so this matches nothing and propVal stays "". Per the markup
// below, the value lives in the propval ATTRIBUTE of div.mg-check, not in
// element text — presumably $('.mg-check').attr('propval') was intended;
// confirm against the live page.
$('a#.matchupDiv').each(function() {
var data = $(this);
propVal = data.text();
json.propVal = propVal;
})
}
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output.json file');
})
res.send('Check your console!')
});
})
app.listen('8081')
console.log('magic happens on port 8081');
exports = module.exports = app;
<tbody>
<tr>
<td rowspan = "2" class='mg-column1 start'></td>
<td rowspan = "2" class='mg-column2 start'></td>
<td rowspan = "2" class='mg-column3 start'></td>
<div class="mg-check" propval="m57207o58439" name="matchupDiv">nbsp;</div>
The jQuery .filter() function takes either an element, a selector, a function, or another jQuery object (http://api.jquery.com/filter/). Since you are passing a function into the filter, it is expecting you to return a boolean (a true/false value).
What you probably want to use is the .each() function. (http://api.jquery.com/jquery.each/). When you query a class, you will get back an array of matching objects. .each() will loop through the array and you could do what you are trying to do there. Give it a shot.
ADDED***** I was looking at the source code, and there is no class propval. You need to change your query to:
$('.mg-check').attr('propVal')
But .mg-check doesn't appear to have any children. What are you trying to scrape exactly?
EDITED *** To get an array of the propVals, try this:
Change your initialization of your json.propVal from '' to [], then...
$('.mg-check').each(function(){ json.propVal.push($(this).attr('propVal')) });
Below is virtually all of the code for a node.js app that lets you get playlists for artists if you run the command simply followed by an artists name
simplay the Beatles
From the output in the terminal, I know that the code in the ._flush method (added to the prototype of UrlsForNamesTransform) is getting run but it's never explicitly called. UrlsForNamesTransform extends the Transform stream in node.js, which I mention because I've seen it in other code before where a function is running without explicitly getting called (at least that I can see). Is it something about Transform or what is happening to make the code in ._flush run?
This is the github repo for the code https://github.com/thlorenz/simplay
'use strict';
// last.fm "similar artists" endpoint; {{artist}} and {{apikey}} are filled
// in per request by simplay() below.
var urlschema = 'http://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist={{artist}}&api_key={{apikey}}&format=json';
// HTTP client plus terminal table/colour helpers used to render the output.
var hyperquest = require('hyperquest')
, table = require('text-table')
, colors = require('ansicolors')
, styles = require('ansistyles')
var stream = require('stream');
var util = require('util');
var Transform = stream.Transform;
// Wire up the prototype chain: UrlsForNamesTransform is a Transform stream
// subclass, so the stream machinery invokes _transform for each incoming
// chunk and _flush once the writable side ends (they are never called
// directly by user code).
util.inherits(UrlsForNamesTransform, Transform);

/**
 * Transform stream that buffers a last.fm "similar artists" JSON response
 * and flushes it as a colourised text table of playlist/artist URLs.
 *
 * @param {Object} [opts] stream options; opts.artist is the queried artist
 */
function UrlsForNamesTransform (opts) {
  // Support calling without `new`.
  if (!(this instanceof UrlsForNamesTransform)) return new UrlsForNamesTransform(opts);
  opts = opts || {};
  Transform.call(this, opts);
  // Keep incoming strings as-is on the writable side.
  this._writableState.decodeStrings = false;
  this.artist = opts.artist;
  // Accumulates the raw JSON body across chunks.
  this.json = '';
}

// Buffer each chunk of the response body until the stream ends.
UrlsForNamesTransform.prototype._transform = function (chunk, encoding, cb) {
  this.json += chunk.toString();
  cb();
};

// After the last chunk: parse the buffered JSON and push either a rendered
// table of per-artist URLs or a friendly "not found" message.
UrlsForNamesTransform.prototype._flush = function (cb) {
  var records = [];
  try {
    var o = JSON.parse(this.json);
    var artists = o.similarartists.artist;
    if (!Array.isArray(artists)) {
      // BUG FIX: user-facing message said "where found" — corrected to
      // "were found".
      this.push('Sorry, no records for "' + this.artist + '" were found, please correct your spelling and/or try another artist.');
      return cb();
    }
    artists.forEach(function (node) {
      // Build one search/profile URL per service for this similar artist.
      var youtubeurl = 'http://www.youtube.com/results?search_query={{artist}},playlist'.replace('{{artist}}', node.name);
      var rdiourl = 'http://www.rdio.com/search/{{artist}}/artists/'.replace('{{artist}}', node.name);
      var lastfmurl = 'http://www.last.fm/music/{{artist}}'.replace('{{artist}}', node.name);
      var lastfmRadioUrl = 'http://www.last.fm/listen/artist/{{artist}}'.replace('{{artist}}', node.name);
      var urls = [
        ''
        , colors.white(' youtube: ') + styles.underline(colors.brightBlue(encodeURI(youtubeurl)))
        , colors.blue (' rdio: ') + styles.underline(colors.brightBlue(encodeURI(rdiourl)))
        , colors.brightRed (' last.fm: ') + styles.underline(colors.brightBlue(encodeURI(lastfmurl)))
        , colors.red (' last.fm radio: ') + styles.underline(colors.brightBlue(encodeURI(lastfmRadioUrl)))
        , ''
        , ''].join('\n');
      records.push([ '\n' + colors.brightYellow(node.name), colors.cyan(node.match), urls ]);
    })
    this.push(table(records.reverse()));
    cb();
  } catch (err) {
    // Surface JSON.parse (or rendering) failures as a stream error.
    cb(err);
  }
}
var go = module.exports =
/**
 * Retrieves similar artists for the given artist from last.fm using the apikey.
 * Then it converts the information to display youtube.com, last.fm, rdio playlist/artist urls for each artist.
 *
 * @name simplay
 * @function
 * @param {String} artist the artist to find similar artists for
 * @param {String} apikey the api key to be used with last.fm
 * @return {ReadableStream} that will push the url information
 * @throws {Error} when artist or apikey is missing
 */
function simplay(artist, apikey) {
  // BUG FIX: user-facing error said "provid ... that you like" — corrected.
  if (!artist) throw new Error('Please provide the artist that you would like to get similar artist links for');
  if (!apikey) throw new Error('Please set LASTFM_API env variable to your API key: http://www.last.fm/api/account/create');
  // Fill in the query-template placeholders for this request.
  var url = urlschema
  .replace('{{artist}}', artist)
  .replace('{{apikey}}', apikey);
  // Fetch the JSON and pipe it through the formatting transform; errors on
  // either stream are logged rather than thrown.
  return hyperquest(url)
  .on('error', console.error)
  .pipe(new UrlsForNamesTransform({ artist: artist }))
  .on('error', console.error)
};
The important line is this one:
util.inherits(UrlsForNamesTransform, Transform);
What this means is that UrlsForNamesTransform is a subclass of Transform. There is very good documentation on subclassing Transform, which can be found on the node.js api site.
Essentially, a subclass of Transform must implement _transform and can implement _flush, but is expected to never call either of those functions. Methods in Transform will call them based on events on the incoming stream.
I'm trying to save the result to a JSON file, but when I inspect the file the output is cut off partway through. Something is wrong in my code but I can't tell which part — thanks for your help.
var request = require("request");
var cheerio = require("cheerio");
var fs = require('fs');
// Pages to crawl for links.
var urls = ["http://www.fordencuotas.com.ar"]
// Fetch one page and write every anchor's href to file.json.
var req = function(url){
request({
uri: url,
}, function(error, response, body) {
var $ = cheerio.load(body);
$("a").each(function() {
var link = $(this);
// One-entry object per anchor: { iti: [href] }.
var itri = {iti: new Array(link.attr("href"))}
var data = JSON.stringify(itri);
// NOTE(review): fs.writeFile is asynchronous and this loop starts one
// write per anchor to the SAME file, so the writes race and interleave —
// which is why file.json ends up truncated/garbled. Either collect all
// hrefs and write once after the loop, or use fs.writeFileSync.
fs.writeFile("file.json", data, function(err){
if(err){console.log(err);} else {console.log("archivo guardado..");}
});
});
});
}
for (var i = 0; i < urls.length; i++){
req(urls[i]);
}
console.log("cargando...");
this output
[opmeitle#localhost crawler1]$ node crawmod.js
cargando...
archivo guardado..
archivo guardado..
archivo guardado..
archivo guardado..
archivo guardado..
...
archivo guardado..
[opmeitle#localhost crawler1]$ cat file.json
{"iti":["productos/autos/nuevo-focus.html"]}us.html"]}
[opmeitle#localhost crawler1]$
There's a couple of issues in your code.
First, you are trying to overwrite the same file (file.json) for each a element. I'm not sure if that's your intention, but it seems rather pointless.
Secondly, fs.writeFile is asynchronous. That means that Node doesn't wait until the file is written before it returns to your loop. In other words, for each a element you open the same file, while it might have already been opened by an earlier iteration of your loop. And each iteration is writing to the same file, so you're going to end up with unexpected results.
You can either use fs.writeFileSync to synchronously write the file, which would make Node wait until the data has been written to the file before continuing, or gather all the data that you want saved to the file in a variable, and — after the $("a").each(...) loop — write that variable to the file just once.
That last solution could look something like this:
// Collect every href first, then persist the whole list in a single write.
var data = [];
$("a").each(function() {
var anchor = $(this);
// Same shape as before: one { iti: [href] } entry per anchor.
data.push({ iti: [anchor.attr("href")] });
});
// One asynchronous write for the complete collection.
fs.writeFile("file.json", JSON.stringify(data), function(err){
if(err){console.log(err);} else {console.log("archivo guardado..");}
});