Its probably a stupid mistake, but its taking me too long to find the answer
When i run this simple program (node index.js) i get an error:
SyntaxError: missing ) after argument list (line 55)
You can see sublimelinter found nothing either
I've tried to delete and install all packages again
I've triple checked everything
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var readLine = require('readline');
var URL_SEED= "https://g1.globo.com/";
var MAX_PAGES_TO_VISIT = 100;
var pagesVisited = []; //array, {objeto}
var numPagesVisited = 0;
var pagesToVisit = [];
var allAbsoluteLinks = [];
var url = new URL(URL_SEED); //pesq
var urlBase = url.protocol + "//" + url.hostname; //pesq
pagesToVisit.push(URL_SEED);
crawl();
//pegar href vs a
function crawl() {
if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
console.log("Limites de páginas que posso visitar atingido (100)");
return;
}
var nextPage = pagesToVisit.pop(); //peq
if (nextPage in pagesVisited) { //peq
// pagina ja visitada
crawl();
} else {
// pagina nao visitada
visitaPagina(nextPage, crawl);
}
}
function visitaPagina(url, callback) {
// Add page to our set
pagesVisiteu[url] = true;
numPagesVisited++;
console.log("Visitando a página " + url);
// Faz requisicao
request(url, function(error, response, body) {
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
} else {
// Parse do body
var $ = cheerio.load(body);
coletaLinks($);
}
}
}
function coletaLinks($) {
//var linksRelativos = $("a\[href^='/'\]"); //NAO TA PEGANDO DO JEITO QUE
var linksRelativos = $("a\[href^='/'\]");
console.log("Achei " + linksRelativos.length + " links relativos nessa página");
linksRelativos.each(function() {
pagesToVisit.push(baseUrl + $(this).attr('href'));
});
console.log("\n\nPAGES TO VISIT = " + pagesToVisit + "\n\n");
var key = [];
for (key in linksRelativos) {
if(linksRelativos.hasOwnProperty(key)) { //realmente preciso ver essa property?
console.log(key, linksRelativos\[key\].attribs.href);
}
}
fs.writeFile('relativos.txt', linksRelativos, function(err) {
if(err) {
return console.log(err);
//throw err;
}
});
}
I'm not sure why your linter isn't complaining, but you are missing a closing bracket in the request call inside visitaPagina. Second line from the bottom:
function visitaPagina(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
console.log("Visitando a página " + url);
// Faz requisicao
request(url, function(error, response, body) {
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
} else {
// Parse do body
var $ = cheerio.load(body);
coletaLinks($);
}
} // should be })
}
Also, something went wrong with copying the code on SO, you might want to check what's up with that for any future submissions. In this case the screenshots helped, but you should also know that images of code are generally frowned upon on SO.
Good luck!
Related
I'm a beginner of nodejs, async bothers me.
I want my code run sequencely or it will breaks.
I have a for loop, and it simply doesn't work...
Here are all the codes:
const util = require('util');
const request = require('request');
const cheerio = require('cheerio');
var host = "http://www.nicotv.me";
var url = "http://www.nicotv.me/video/play/57838-1-%s.html";
var len = 99;
var tab = /-(\d)-/.exec(url);
tab = tab[1] // '1' not '-1-'
function getLen(url) {
//you can ignore this function, it gives len=2
request(url, function (err, response, html) {
if (err) {
console.log('url:', url);
console.log('error:', err);
console.log('statusCode:', response && response.statusCode);
}
else{
var $ = cheerio.load(html);
var cls = '.ff-playurl-dropdown-%s';
$(util.format(cls, tab)).filter(function (){
var data = $(this);
len = data.html().match(/<a href=/g).length;
console.log("episode:", len);
});
getLink(len, function(){
});
}
});
}
getLen(util.format(url, 1)); //len = 2
var getLink = function(lengths, callback){
for (let i = 1; i <= lengths; i++) {
var tmp = util.format(url, i);
try {
request(tmp, function (err, res, html){
console.log('url:', tmp);
if(err){
console.log("error:", err);
console.log("statusCode:", res && res.statusCode);
}else{
var reg = /src="(\/player.php?.{1,})"/;
var result = reg.exec(html);
console.log(result[1]);
}
});
callback();
} catch (error) {
console.log(error);
break;
}
}
}
here is my output:
episode: 2
url: http://www.nicotv.me/video/play/57838-1-2.html
/player.php?u=aHR0cDovL3R5angyLmtpbmdzbnVnLmNuLzM2MHl1bi0xNS5waHA/dmlkPTE1NzkxMzU2MzEyNDAwNTQ5&p=360biaofan&c=0&j=aHR0cDovL2ppZXhpLmtpbmdzbnVnLmNuLzM2MGJpYW9mYW4ucGhwP3VybD0=&x=10&y=&z=
url: http://www.nicotv.me/video/play/57838-1-2.html
/player.php?u=aHR0cDovL3R5angyLmtpbmdzbnVnLmNuLzM2MHl1bi0xNS5waHA/dmlkPTE1Nzg1MDQyMDYyNDAwNTgx&p=360biaofan&c=0&j=aHR0cDovL2ppZXhpLmtpbmdzbnVnLmNuLzM2MGJpYW9mYW4ucGhwP3VybD0=&x=10&y=&z=aHR0cDovL3R5angyLmtpbmdzbnVnLmNuLzM2MHl1bi0xNS5waHA/dmlkPTE1NzkxMzU2MzEyNDAwNTQ5
First problem is these two /player*** link are from 57838-1-1.html
And one of them are not complete.
Second problem is the url output shows 57838-1-2.html twice.
Thanks for your kindly help.
Yesterday had the same problem, so I solved with:
Using request-promise
Replace the loop method arrTitles.Each with for (const jt of arrTitles)
Here a sample:
const request = require('request-promise');
const cheerio = require('cheerio');
var getUrlData =
async function (url) {
console.log(url);
try {
return await request.get(url);
}
catch (err) {
console.error(`${err}: ${url}`);
}
return;
};
var run =
async function (pageUrl) {
var arrData =
await fn.getUrlData(pageUrl)
.then(response => readTable(response));
console.log(arrData);
};
var readTable =
function (document) {
var $;
let arrData = [];
try {
$ = cheerio.load(document);
$('table tr')
.each(
function (trN) {
$(this)
.children('td')
.each(
function (tdN) {
arrData.push($(this).text().trim());
}
)
});
}
catch { }
return arrData;
};
run();
Being new to nodejs ans async following is the code that I came across.
app = express();
/*
other express use calls like - app.use(bodyParser.json());
*/
var async = require("async");
var server;
app.post('/callType/call', function(req, res) {
var startTime = Date.now();
server = req.body.server;
//async.map asynchronuously call enrollStep1 for every element in the req.body.nodes array
//HOW DOES THIS WORK??!! - WHERE IS THE CALLBACK DEFINED OR SOURCED FROM???
//******************************************************
async.map(req.body.nodes, function(node, callback) {
someFunc(node.property1,node.property2,callback)
},
//This function is called when every task triggered by async.map has called its callback.
function(err, results) {
var response = {};
if (err) {
response.success = false;
response.error = err;
console.log("ERROR returned: " + JSON.stringify(response));
res.json(response);
} else {
var returnResults = [];
//Results is an array of array - flatten it
var flattenedResults = [].concat.apply([], results);
//then remove duplicates
for (var i = 0; i < flattenedResults.length; i++){
var obj = flattenedResults[i];
var isDup = returnResults.some(function(element) {
return element.tid === obj.tid;
});
if (!isDup) {
returnResults.push(obj);
}
}
response.success = true;
response.results = returnResults;
res.json(response);
}
});
});
function someFunc(property1, property2, callback) {
var url = '/'+callTypes +'/'+ call +'/'+ property1 +'/'+ property2
urClient
.get(server + url)
.header('Content-Type', 'application/json')
.end(
function(response) {
if (response.code !== 200) {
callback("Error " + ". Code: " + response.code + " Response: " + JSON.stringify(response));
} else {
callback("Success " + ". Code: " + response.code + " Response: " + JSON.stringify(response));
}
}
);
}
The iteratee function for async.map has a definition starting function(node, callback) { but the callback function is never assigned. How does the callback work over here.
Isn't it supposed to be assigned somewhere like callback = myCallbackFunction;
The async.map takes 3 arguments, the array/object, the function to map the data and the callback function, so your code should be:
async.map(req.body.nodes, someFunc , function(err, results) {
if (err) return console.log(err);
console.log(results);
});
And your someFunc should be:
function someFunc(item, callback) {
// do something with item
// it's each item in the original array/object
callback('The results');
}
This is a basic example: http://code.runnable.com/UyR-6c2DZZ4SmfSh/async-map-example-for-node-js
I looked at other questions regarding this topic but can't wrap my head around how to implement it in this case.
What I am trying to achieve:
Visit site and get content (body)
Visit matching test site and get content (body)
Compare content
Crawl links on page1
Crawl links on page2
Continue
The problem I am having at the moment is that I cannot compare the content because the requests are not waiting for each other.
Here's what my code looks like at the moment.
require('colors');
var request = require('request');
var cheerio = require('cheerio');
var jsdiff = require('diff');
var URL = require('url-parse');
var PROD_START_URL = "https://www.somesite.org";
var MAX_PAGES_TO_VISIT = 100;
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var globalProdContent;
var globalTestContent;
var url = new URL(PROD_START_URL);
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(PROD_START_URL);
crawl();
function crawl() {
if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
console.log("Reached max limit of number of pages to visit.");
return;
}
var nextPage = pagesToVisit.pop();
if (nextPage in pagesVisited) {
// We've already visited this page, so repeat the crawl
crawl();
} else {
// New page we haven't visited
visitPage(nextPage, crawl);
}
}
function visitPage(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
// Make the request
console.log("Visiting page " + url);
request(url, function(error, response, body) {
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
// Parse the document body
var $ = cheerio.load(body);
globalProdContent = $("#wrapper").text();
// Build new URL for test site
var testURL = url.replace("https://www.somesite.org", "http://matching.testsite");
// Scrape test site
scrapeTestContent(testURL);
collectInternalLinks($);
callback();
});
}
function collectInternalLinks($) {
var relativeLinks = [];
relativeLinks = $("a[href]");
console.log("Found " + relativeLinks.length + " relative links on page");
relativeLinks.each(function() {
pagesToVisit.push(baseUrl + "/" + $(this).attr('href'));
});
}
function scrapeTestContent(testURL) {
console.log("Visiting matching testpage " + testURL);
request(testURL, function(error, response, body) {
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
var $ = cheerio.load(body);
globalTestContent = $("#wrapper").text();
console.log(globalTestContent);
});
}
Is there an easier way to do this or am I completely off the track?
This can be done in two ways:
1. Add callback to scrapeTestContent
function scrapeTestContent(testURL, cb) {
...
request(testURL, function(error, response, body) {
cb();
});
In visitPage,
function visitPage(url, callback) {
...
scrapeTestContent(testURL, () => collectInternalLinks($));
}
Using ES6 promises. In scrapeTestContent() return new Promise((resolve, reject) => {}. Then in visitPage, use following construct: scrapeTestContent(testUrl).then(() => collectInternalLinks($))
I have written this non-blocking nodejs sample recursive file search code, the problem is I am unable to figure out when the task is complete. Like to calculate the time taken for the task.
fs = require('fs');
searchApp = function() {
var dirToScan = 'D:/';
var stringToSearch = 'test';
var scan = function(dir, done) {
fs.readdir(dir, function(err, files) {
files.forEach(function (file) {
var abPath = dir + '/' + file;
try {
fs.lstat(abPath, function(err, stat) {
if(!err && stat.isDirectory()) {
scan(abPath, done);;
}
});
}
catch (e) {
console.log(abPath);
console.log(e);
}
matchString(file,abPath);
});
});
}
var matchString = function (fileName, fullPath) {
if(fileName.indexOf(stringToSearch) != -1) {
console.log(fullPath);
}
}
var onComplte = function () {
console.log('Task is completed');
}
scan(dirToScan,onComplte);
}
searchApp();
Above code do the search perfectly, but I am unable to figure out when the recursion will end.
Its not that straight forward, i guess you have to rely on timer and promise.
fs = require('fs');
var Q = require('q');
searchApp = function() {
var dirToScan = 'D:/';
var stringToSearch = 'test';
var promises = [ ];
var traverseWait = 0;
var onTraverseComplete = function() {
Q.allSettled(promises).then(function(){
console.log('Task is completed');
});
}
var waitForTraverse = function(){
if(traverseWait){
clearTimeout(traverseWait);
}
traverseWait = setTimeout(onTraverseComplete, 5000);
}
var scan = function(dir) {
fs.readdir(dir, function(err, files) {
files.forEach(function (file) {
var abPath = dir + '/' + file;
var future = Q.defer();
try {
fs.lstat(abPath, function(err, stat) {
if(!err && stat.isDirectory()) {
scan(abPath);
}
});
}
catch (e) {
console.log(abPath);
console.log(e);
}
matchString(file,abPath);
future.resolve(abPath);
promises.push(future);
waitForTraverse();
});
});
}
var matchString = function (fileName, fullPath) {
if(fileName.indexOf(stringToSearch) != -1) {
console.log(fullPath);
}
}
scan(dirToScan);
}
searchApp();
I have this code and I'm not able to create a new array for further use:
var request = require("request");
var cheerio = require("cheerio");
var pag = [];
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
return console.error('upload failed:', error);
}
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag[i] = sop; //aici pun val gasite in locuri in array
});
pag.push(', ');
});
for (var i=0; i<2; i++){
console.log(pag[i]);
}
When I run the code it is listing undefined. But if I put the code like this:
var request = require("request");
var cheerio = require("cheerio");
var pag = [];
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
return console.error('upload failed:', error);
}
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag[i] = sop; //aici pun val gasite in locuri in array
});
pag.push(', ');
for (var i=0; i<2; i++){
console.log(pag[i]);
}
});
Then it is displaying correct result but still undefined when i'd like to use it later.
Can someone help me up with this.
Node.js is async, that means the scrape hasn't finished yet when you go to print out the array.
I'm not totally sure what your end goal is, but here is a way to do what you are trying with minimal changes:
var request = require("request");
var cheerio = require("cheerio");
var pag = [];
var scrape = function( callback ) {
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
return console.error('upload failed:', error);
}
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag[i] = sop; //aici pun val gasite in locuri in array
});
pag.push(', ');
if (callback) callback()
});
}
scrape(function() {
for (var i=0; i<2; i++){
console.log(pag[i]);}
})
Catalyst is right, the problem is that you are not waiting for the async request call to complete. Here is my solution:
function getLinks(callback){
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
callback(new Error('upload failed:', error),null);
}
var pag = [];
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag.push(sop); //aici pun val gasite in locuri in array
});
callback(null, pag);
});
}
getLinks(function(err,links){
if(err) return console.log(err);
console.log(links.join(','));
})
here I am defining a functions that makes the request call and it accepts a callback in the standard node callback convention on putting the error message as the first parameter and the results as the second parameter. Then calling that method with a callback that will print the results.