web scrapy by nightmare about loop ,output is not same every time - node.js

var Nightmare = require('nightmare');
var nightmare = Nightmare({ show: true });
var fs = require('fs');
vo = require('vo');
var result;
nightmare
.goto('http://jufa-kyusyu.jp/teams/')
.wait(1000)
.evaluate(function () {
var options = document.querySelectorAll('option'),i;
var values =[]
for (i = 0; i < options.length; ++i) {
values.push(options[i].value)
}
return values;
})
.then(function (values) {
for (var i = 0; i < values.length; i++) {
if(values[i] == "#") values[i] = "/teams/181.html";
nightmare
.goto("http://www.jufa-kyusyu.jp"+values[i])
.evaluate(function () {
var abc = document.querySelector('iframe[class="autoHeight"]').src.toString()
return abc;
})
.then(function (result) {
console.log(result)
})
.catch(function (error) {
console.error('Search failed:', error);
});}
})
.catch(function (error) {
console.error('Search failed:', error);
});
I want to scrapy the web information by nightmarejs looply.I dont know why have two result link is same and the result is changed in running every time.thank you.

You have to be careful when working with async calls inside a loop with Nightmare
Check this answer and this detailed explanation about the concept.
The main idea can be sumarized by this sentence:
Executing the operations in series requires arranging them to execute
in sequential order
The documentation shows how to achieve that using plain, vanilla js and also with vo
Here is a sneak peek on how to solve this loop issue with plain Javascript:
var urls = ['http://example1.com', 'http://example2.com', 'http://example3.com'];
urls.reduce(function(accumulator, url) {
return accumulator.then(function(results) {
return nightmare.goto(url)
.wait('body')
.title()
.then(function(result){
results.push(result);
return results;
});
});
}, Promise.resolve([])).then(function(results){
console.dir(results);
});
Basically what you need to do is queue all your calls in a list and trigger them using Promise.resolve

Related

I want my aws lamda to not return anything until the mysql query has been executed completely on node.js

I will provide the code first which is an AWS node.js Lambda function
const mysql = require('mysql');
const con = mysql.createConnection({
host: 'testtest.ap-southeast-2.rds.amazonaws.com',
user: 'test',
password: 'test',
database: 'test',
});
exports.handler = function(event,context,callback){
context.callbackWaitsForEmptyEventLoop = false;
var queryResult=[];
var searchbyArray = (event.searchby);
var len = searchbyArray.length;
getResult(len,searchbyArray,function(err,data){
if(err){console.log("the error is "+err);}
else{
callback(null,data);
}
});
};
function getResult(len,searchbyArray,cb){
var results=[];
for(var i=0; i<len; i++){
console.log("before loop"+i);
var sql ="SELECT * FROM aa_customer_device WHERE id LIKE '%"+searchbyArray[i]+"%'";
con.query(sql,function(err,result){
if (err){cb(err,null);}
else{
results.push(result);
}
});
}
cb(null,results);
}
Here, the cb(null, results) gets executed before the for loops gets completed since the con.query(...) is an async function(guess that's what its called) and always returns an empty array in 'results' which is then returned by the handler in its callback function.
Is there any way I can write the code in node.js so that it follows the sequence like
finish executing all the con.query(...) inside the for loop
push the resulting array from each loop iteration to the 'results' array
only after the above two steps has been completed call the cb(null,results)
what's happening here is the cb(null, results) is being executed way before the for loop gets a chance to store the desired information in the results array.
is there any way to accomplish what I want using node.js. I was suggested to use promises, async,await but form what I read all of those solutions won't exactly solve my problem.
You need to manually detect when the last request have completed:
function getResult(len,searchbyArray,cb){
var results=[];
var requestCount = 0;
var error = null;
for(var i=0; i<len; i++){
requestCount++; // count the requests made
var sql ="SELECT * FROM aa_customer_device WHERE id LIKE '%"+searchbyArray[i]+"%'";
con.query(sql,function(err,result){
// Note: All logic needs to happen here since
// this is the ONLY place where code gets
// executed in the FUTURE instead of now.
requestCount --; // count completed requests
if (err){error = err}
else{
results.push(result);
if (requestCount === 0) {
if (error) cb(error,null); // "return" error
else cb(results); // "return" results
}
}
});
}
}
With async/await
If you have access to an ES6 environment you can use async/await to make this much easier to read. First you need to promisify the query:
// Promisify con.query:
function query (con, sql) {
return new Promise((resolve, reject) => {
con.query(sql, (err, result) => {
if (err) reject(err);
else resolve(result);
});
});
}
async function getResult(len,searchbyArray,cb){
var results=[];
for(var i=0; i<len; i++){
var sql ="SELECT * FROM aa_customer_device WHERE id LIKE '%"+searchbyArray[i]+"%'";
try {
var result = await query(con, sql);
results.push(result);
}
catch (err) {
cb(err,null);
return; // stop processing
}
}
cb(results);
}
You already got the problem. cb(null , results) is not waiting for the query to complete. In order to make it wait, as you already mentioned, you can use promise/async-await.
here is how you can write an async/await code to achieve what you are trying to do.
const mysql = require('mysql');
const promisify = require('util').promisify;
const con = mysql.createConnection({
host: 'testtest.ap-southeast-2.rds.amazonaws.com',
user: 'test',
password: 'test',
database: 'test',
});
// con.connect uses a callback, we can use util.promify to convert it to promise driven
const connect = promisify(con.connect);
// con.query uses a callback, we can use util.promify to convert it to promise driven
const query = promisify(con.query);
exports.handler = async function (event, context) {
context.callbackWaitsForEmptyEventLoop = false;
const queryResult = [];
const searchbyArray = (event.searchby);
const len = searchbyArray.length;
try {
// wait for the connection to be established.
await connect();
const data = getResult(len, searchbyArray);
// return the lambda success result
return {
statusCode: 200,
responseBody: JSON.stringify(data)
}
} catch(err) {
console.log("the error is " + err)
// return the lambda failure result
return {
statusCode: 500,
responseBody: err.message
}
}
};
async function getResult(len, searchbyArray, cb) {
const results = [];
for (const i = 0; i < len; i++) {
console.log("before loop" + i);
const sql = `SELECT * FROM aa_customer_device WHERE id LIKE '%${searchbyArray[i]}%'`;
// wait for the query to complete
const results = await query(sql);
results.push(result);
}
return results;
}
hope this helps, feel free to ask me if you don't understand this code.
Reason is con.query() is a callback function. Callbacks are asynchronous, which means it will not execute in order. you have two options.
Use await keyword (If possible)
Wrap function by using Promise
I actually don't know AWS Lambda Mysql support async/await hence I will demonstrate the second option.These two options do the same thing but different patterns.
Situation in here is callbacks execute asynchronously. It will not "await" until result comes that's why you got an empty array.This is how async function behave.It is very important topic to understand, if you are going to develop even server less application in Node js or Java script.Please comment if you want more explanation.
Here you can wrap con.query() function by using Promise.
const con = mysql.createConnection({
host: 'testtest.ap-southeast-2.rds.amazonaws.com',
user: 'test',
password: 'test',
database: 'test',
});
exports.handler = async (event,context,callback)=>{
context.callbackWaitsForEmptyEventLoop = false;
var queryResult=[];
var searchbyArray = (event.searchby);
var len = searchbyArray.length;
try{
const result = await Promise.all(getResult(searchbyArray));
}catch (error){
console.log("the error is "+err);
}
};
const getResult = async (searchbyArray) =>{
const result = searchbyArray.map(searchItem=>{
var sql =`SELECT * FROM aa_customer_device WHERE id LIKE %${searchItem}%`;
return new Promise((resolve, reject) => {
con.query(sql, (err, result) => {
if (err)
reject(err);
else
resolve(result);
});
});
})
return result;
}
I have replace the for loop with map instead of push to array and used Promise.all this application will run more fast instead of await in each loop cycle.

Sending data back to Android from Firebase Cloud Functions

I want to send data back via functions.https.onCall, the code works and returns data to the console but i don't know why I can't send the data back to the device(in return number one, I understand can't why return number 2 is not working because the asynchronous task, so there is a way to to it after its finish?).
const admin = require('firebase-admin');
const functions = require('firebase-functions');
admin.initializeApp(functions.config().firebase);
var posts= new Array();
var finalposts = new Array();
function post (uid,text,user_name,vid,sendtime,vote,tag,pic,lan){
this.uid=uid;
this.text=text;
this.user_name=user_name;
this.vid=vid;
this.sendtime=sendtime;
this.vote=vote;
this.tag=tag;
this.pic=pic;
this.lan=lan;
}
exports.getWarm = functions.https.onCall((data, context) => {
finalposts = [];
posts = [];
db.collection('local uploads/heb/posts').where('vote', '<', 200).where('vote', '>', 100).get()
.then((snapshot) => {
snapshot.forEach((doc) => {
posts.push( new post(doc.data()["uid"],doc.data()["text"],doc.data()["user_name"],doc.data()["vid"],
doc.data()["sendtime"],doc.data()["vote"],doc.data()["tag"],doc.data()["pic"],doc.data()["lan"]));
});
posts.sort(function(a,b) {return (a.sendtime<b.sendtime)? 1:((a.sendtime>b.sendtime)? -1:0);});
for (var i =0; i<data.refresh_num*2; i++) {
finalposts.push(posts[i]);
}
console.log('hey', '=>', finalposts);
// working: the posts are shown in the console
// (1.) return { posts: finalposts };
// returns null
})
.catch((err) => {
console.log('Error getting documents', err);
});
// (2)return { posts: finalposts };
// returns {posts=[]}
return { posts: 'data' };
// returns {posts=data}
});
You need to return a Promise from your Firebase function in order to return data. Try adding return before your db.collection and see if that works.
From the docs: https://firebase.google.com/docs/functions/callable
To return data after an asynchronous operation, return a promise.
EDIT: I am not very familiar with Firebase Firestore, but any calls that generate a promise, add a return statement before it.
Also, from my experience, you will want to move your return { posts: 'data' }; to execute after your last promise by using then.
well it was simpler then I thought, I just needed to return it.
exports.getWarm = functions.https.onCall((data, context) => {
finalposts = [];
posts = [];
return db.collection('local uploads/heb/posts').where('vote', '<', 200).where('vote', '>', 100).get()
.then((snapshot) => {
snapshot.forEach((doc) => {
posts.push( new post(doc.data()["uid"],doc.data()["text"],doc.data()["user_name"],doc.data()["vid"],
doc.data()["sendtime"],doc.data()["vote"],doc.data()["tag"],doc.data()["pic"],doc.data()["lan"]));
});
posts.sort(function(a,b) {return (a.sendtime<b.sendtime)? 1:((a.sendtime>b.sendtime)? -1:0);});
for (var i =0; i<data.refresh_num*2; i++) {
finalposts.push(posts[i]);
}
console.log('hey', '=>', finalposts);
return { posts: finalposts };
})
.catch((err) => {
console.log('Error getting documents', err);
});
});

How could I return an array from my module to my API using promise? Async Functions and Promise - Node.js

I need to return a json object to my api. To do this I have a module that does some requests and should return the results.
My problem is grasping the promise concept and implementing it.
server.js
app.get('/users', function(req, res){
request.getUsers()
.then(function(users){
console.log(users);
res.contentType('application/json');
res.send(JSON.stringify(users));
})
.catch(function(){
console.log(users);
});
});
module.js
exports.getUsers = function(){
var params = {search_string:""};
var users = [];
return new Promise(function(resolve, reject){
var result = connection.Users.get(params, function(error,response)
{
var user = [];
for(let i = 0; i < response.data.length; i++)
{
user = response.data;
}
users.push({user});
});
if(result != null)
{
console.log(result);
resolve(result);
}
else
{
reject(new Error('Try Again'));
}
});
}
When I run the server I get the typeError: expecting a function but got [object object]
I did not really get what is wrong.
How could I return an array from my module to my API using promises?
EDIT:
app.get('/users', function(req, res){
request.getUsers()
.then(function(users){
console.log(users);
res.contentType('application/json');
res.send(JSON.stringify(users));
})
.catch(function(){
console.log("not resolved");
});
});
My problem now is actually that I am getting the .catch even before any request is made the at /users endpoint and I dont know why.
In module.js you used new Promise() constructor but the input parameter should be a function and not an object, so to fix that use:
return new Promise(function(resolve, reject) {
var result = connection.Users.get(params, function(error,response)
...
});
Notice its not new Promise({function(...) but new Promise(function(...)) ...
Read more here:
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise
Edit:
I have modified your code to work to fix the second problem:
exports.getUsers = function(){
var params = {search_string:""};
var users = [];
return new Promise(function(resolve, reject){
var result = connection.Users.get(params, function(error,response) {
if(error || !response)
{
// report error
reject(new Error('Try Again'));
}
else
{
//process response
var user = [];
for(let i = 0; i < response.data.length; i++)
{
user = response.data;
}
users.push({user});
// report success
resolve(users);
}
});
}
You need to call resolve or reject inside connection.Users.get(params, function(error,response) {
Modify your module.js code as below. You passed an object instead of a function.
register.getUsers = function () {
var params = { search_string: "" };
var users = [];
return new Promise(function (resolve, reject) {
var result = connection.Users.get(params, function (error, response) {
var user = [];
for (let i = 0; i < response.data.length; i++) {
user = response.data;
}
users.push({ user });
});
if (result != null) {
console.log(result);
resolve(result);
}
else {
reject(new Error('Try Again'));
}
});
};
you declared user variable as an array and inside the for loop isn't useful because the user variable is always equals to response.data
if response.data is array of JSON object you can push it to users array inside loop
for (let i = 0; i < response.data.length; i++) {
users.push(response.data[i]);
}
I guess you want to return the array of objects
also I recommend you to use bluebird module to return promises
and also you can use Promise.mapSeries instead of for loop like:
return Promise.mapSeries(response.data, item => {
users.push(item)
})

Nightmare Looping

Hello, I am writing an application where I need to be able to loop through an array of urls. I know there is an example of how to do this but my issue is a little different, I will explain with some code.
nightmare
.goto('some url')
.evaluate(() => {
//evaluate code
})
.then(dataArray => {
var innerRun = function* () {
var returnData = [];
for (var i = 0; i < dataArray.length; i++) {
var item = dataArray[i];
yield nightmare
.goto(item.url)
.evaluate(function () {
return false;
})
.screenshot(item.imgPath)
returnData.push(item);
}
return returnData;
}
vo(innerRun)(function (err, ads) {
if (err) {
console.log("Error running", err)
return;
}
});
});
I would like to be able to loop that code by using an array of urls. I had issues implementing this I believe because I am already doing it inside the then. It would stop running once it hit the yield nightmare inside the then
var mainLoop = function* () {
for (var j = 0; j < urlArray.length; j++) {
var url = urlArray[j];
yield nightmare.goto(url)//same code as in example above
}
}
vo(mainLoop)(function (err, d) {
if (err) {
console.log("Error running", err)
return;
}
});
The above code is what I attempted to do. If anyone has any ideas it would be a huge help thank you!
Maybe try this:
var urls = ['http://example.com', 'http://example2.com', 'http://example3.com'];
var results = [];
urls.forEach(function(url) {
nightmare.goto(url)
.wait('body')
.title()
.then(function(result) {
results.push(result);
});
});
console.dir(results)
Source: https://github.com/rosshinkley/nightmare-examples/blob/master/docs/common-pitfalls/async-operations-loops.md

PhantomJS memory leak and process exit failure

I am currently working on a project with PhantomJS that evaluates a list of web pages specified by a CSV file. I installed NPM and node.js to use in my program.
Here is the program:
var async = require("async");
var webpage = require('webpage'),
fs = require('fs');
var file_h = fs.open('C:\\Users\\morgan\\Documents\\FantasyApp\\URLPlayerListActive.txt', 'r');
var urls = [];
while (!file_h.atEnd()) {
urls.push(file_h.readLine());
}
async.eachSeries(urls, function (url, done) {
console.log(url)
var page = webpage.create();
page.open("http://"+url, function (status) {
if (status !== 'success') {
console.log('Unable to access network');
console.log(status)
var closeresults = page.close();
} else {
var evalresults = page.evaluate(function() {
try {
table2csv('pgl_basic');
try {
ga('send','event','Tool','Action','CSV');
}
catch (e) {}
var list = document.querySelectorAll('#csv_pgl_basic');
var stats = [];
for (var i = 0; i < list.length; i++) {
stats.push(list[i].innerText);
}
return stats;
var closeresults = page.close();
} catch (e) {
console.log(e);
}
});
try {
fs.write("C:\\Users\\morgan\\Documents\\FantasyApp\\Data\\"+url+".txt", evalresults.join('\n'), 'w');
var closeresults = page.close();
} catch(e) {
console.log(e);
var closeresults = page.close();
}
}
done();
});
});
phantom.exit();
My symptoms are either the process memory increases until it reaches my Windows maximum and crashes, OR it finishes my list and the process hangs around forever.
I can implement a work around for either of these problems, but because they both happen, I am unable to put this script to work.
I am looking for assistance preventing the memory leak or simply closing my process when the script is finished. It is possible that these symptoms are from the same root cause.
If the page is not correctly garbage collected, you can try to use the same instance over and over again. The other thing is that you should call phantom.exit when the script actually finished e.g. in the callback of eachSeries.
var page = webpage.create();
async.eachSeries(urls, function (url, done) {
console.log(url)
page.open("http://"+url, function (status) {
if (status !== 'success') {
console.log('Unable to access network');
console.log(status)
} else {
var evalresults = page.evaluate(function() {
try {
table2csv('pgl_basic');
try {
ga('send','event','Tool','Action','CSV');
}
catch (e) {}
var list = document.querySelectorAll('#csv_pgl_basic');
var stats = [];
for (var i = 0; i < list.length; i++) {
stats.push(list[i].innerText);
}
return stats;
} catch (e) {
console.log(e);
}
});
try {
fs.write("C:\\Users\\morgan\\Documents\\FantasyApp\\Data\\"+url+".txt", evalresults.join('\n'), 'w');
} catch(e) {
console.log(e);
}
}
done();
});
}, function(err){
phantom.exit();
});
Some other issues:
page.close doesn't return anything, so closeresults will be undefined.
Any statement that comes after return cannot be executed.
page is not defined in the page context (inside page.evaluate) and therefore page.close(); produces an error which may break your code.
Please register to the onConsoleMessage and onError events to see if there are errors.

Resources