Can't scrape and print the links on the fly

Can't scrape and print the links on the fly - node.js

I've written a script in node.js to scrape the links of different titles from a webpage. When I execute my following script, I get undefined printed in the console instead of the links I'm after. My defined selectors are accurate.
I do not wish to put the links in an array and return the results; rather, I wish to print them on the fly. As I'm very new to write scripts using node.js in combination with puppeteer, I can't figure out the mistake I'm making.
This is my script (Link to that site):
const puppeteer = require('puppeteer');
function run () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://stackoverflow.com/questions/tagged/web-scraping");
let url = await page.evaluate(() => {
let items = document.querySelectorAll('a.question-hyperlink');
items.forEach((item) => {
//would like to keep the following line intact
console.log(item.getAttribute('href'));
});
})
browser.close();
return resolve(url);
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
The following script works just fine if I consider to declare an empty array results and store the scraped links within it and finally return the resultsbut I do not wish to go like this. I would like to stick to the way I tried above, as in printing the result on the fly.
const puppeteer = require('puppeteer');
function run () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://stackoverflow.com/questions/tagged/web-scraping");
let urls = await page.evaluate(() => {
let results = [];
let items = document.querySelectorAll('a.question-hyperlink');
items.forEach((item) => {
results.push({
url: item.getAttribute('href'),
});
});
return results;
})
browser.close();
return resolve(urls);
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
Once again: my question is how can I print the link like console.log(item.getAttribute('href')); on the fly without storing it in an array?

To run console.log() inside evaluate() simply copy the line below where you are defining page
page.on('console', obj => console.log(obj._text));
so now the whole snippet will be like this now
const puppeteer = require('puppeteer');
function run () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
page.on('console', obj => console.log(obj._text));
await page.goto("https://stackoverflow.com/questions/tagged/web-scraping");
let url = await page.evaluate(() => {
let items = document.querySelectorAll('a.question-hyperlink');
items.forEach((item) => {
//would like to keep the following line intact
console.log(item.getAttribute('href'));
});
})
browser.close();
return resolve(url);
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
Hope this help

The library looks a bit awkward to use but found the proper way to get an href from this thread on github- https://github.com/GoogleChrome/puppeteer/issues/628
The working code I have is to use await page.$$eval
async function getStackoverflowLinks(){
return new Promise(async(resolve, reject)=>{
console.log(`going to launch chromium via puppeteer`)
const browser = await puppeteer.launch()
console.log(`creating page/tab`)
const page = await browser.newPage()
await page.goto('https://stackoverflow.com/questions/tagged/web-scraping')
console.log("fetched SO web-scraping, now parsing link href")
let matches = await page.$$eval('a.question-hyperlink', hrefs=>hrefs.map((a)=>{
return a.href
})) // $$eval and map version, $$eval returns an array
console.log("matches = ", matches.length)
await browser.close()
resolve(matches)
})
}
getStackoverflowLinks()
.then(hrefs=>{
console.log("hrefs: ", hrefs)
})

Things to note,
async function will return a promise.
new Promise will also return a promise.
On that note, you can simply use the .console events to print them on fly. Usage,
page.on("console", msg => console.log(msg.text()));
await page.evaluate(async => {
console.log("I will be printed on node console too")
})
Advanced usage has been discussed on this answer.

Related

How to get the quantity of children inside the element using xpath query on the puppeter?

I need to get the number of children inside an element with the page.$x(xpath) function in Puppeteer, mad I was not successful. In the browser console I can do it using $x().children.length. What would be the best way to do this in the puppeteer with node?

You can use either element/JS handle API or pure Web API in page.evaluate(). These are both wways:
const puppeteer = require('puppeteer');
(async function main() {
try {
const browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto('https://example.org/');
const [element] = await page.$x('//body/div');
const children = await element.getProperty('children');
const length1 = await (await children.getProperty('length')).jsonValue();
console.log(length1); // 3
const length2 = await page.evaluate(() => {
return document.evaluate(
'//body/div', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE
).singleNodeValue.children.length;
});
console.log(length2); // 3
await browser.close();
} catch (err) {
console.error(err);
}
})();

HTML not extracted NODEJS

I want to scrape twitter tweets using nodejs and puppeteer
I don't want to create a developer account etc
The below code return null in scraping code. But when I write this code in twitter website it gives me HTML
var html = document.querySelector('main nav').nextElementSibling;
Code
'use strict';
const puppeteer = require('puppeteer');
function run() {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch({
headless : false
});
const page = await browser.newPage();
await page.setRequestInterception(true);
// add header for the navigation requests
page.on('request', request => {
// Do nothing in case of non-navigation requests.
if (!request.isNavigationRequest()) {
request.continue();
return;
}
// Add a new header for navigation request.
const headers = request.headers();
// headers['proxy'] = super_proxy;
request.continue({ headers });
});
await page.goto("https://www.twitter.com/Udemy");
await page.evaluate(`window.scrollTo(0, document.body.scrollHeight)`);
await page.waitFor(5000);
await page.waitFor('main nav');
let urls = await page.evaluate(() => {
let results = [];
var parser = new DOMParser();
var html = document.querySelector('main nav').nextElementSibling;
var $ = parser.parseFromString(html, 'text/html');
var html = document.querySelector('section > div > div > div');
//Error return empty HTML --------------------- <<<<<<<<<<<<<
return html;
})
browser.close();
return resolve(urls);
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
Result that I want

From the docs:
If the function passed to the page.evaluate returns a non-Serializable value, then page.evaluate resolves to undefined.
and here
The only difference between page.evaluate and page.evaluateHandle is that page.evaluateHandle returns in-page object (JSHandle).
replace page.evaluate with page.evaluateHandle:
let urls = await page.evaluateHandle(() => { ... return html })

Puppeteer getting element from elementHandle causing protocol error

I'm trying to scrape a certain facebook page for its posts written by a certain user and starting with a certain word.
const puppeteer = require('puppeteer');
async function findPosts(page) {
const USERNAME = 'test123';
const posts = await page.$$('.userContentWrapper');
return posts.filter(async post => {
try {
let usernameElement = await post.$('.fwb');
let username = await page.evaluate(element => element.textContent, usernameElement);
if (username === USERNAME) {
let postElement = await post.$('[data-testid="post_message"] p');
let postContent = page.evaluate(element => element.textContent, postElement);
return /\[test \d+\]/.test(postContent);
}
return false;
} catch(e) {
console.log(e);
return false;
}
});
}
(async () => {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.facebook.com/groups/groupid/');
const pageTitle = await page.title();
console.log(pageTitle);
const posts = await findPosts(page);
console.log(posts);
await browser.close();
})();
I'm getting
Error: Protocol error (Runtime.callFunctionOn): Target closed. when
I'm trying to get the usernameElement
at this line:
let usernameElement = await post.$('.fwb');
Not sure what's going wrong here, any suggestions?

The problem is that the filter function does not work with Promises. So the return posts.filter(...) will immediately return and after that the browser is closed. Therefore, when you try to run the $ function on the page, the page does not exist anymore and you get the Target closed error.
To make it work with async/await syntax, you could use a simple loop instead:
async function findPosts(page) {
const USERNAME = 'test123';
const posts = await page.$$('.userContentWrapper');
const postsToReturn = [];
for (let post of posts) {
/* ... if else logic */
postsToReturn.push(post); // instead of return true
}
return postsToReturn;
}

puppeteer : cant log in and loop through urls

Hi Guys I want to log in a website and once authenticated want to loop through a given set of URLS and scrape data. What I intend to do can be described by this example,however I get Unhandled promise rejection.
const puppeteer = require("puppeteer");
list = [
"https://www.facebook.com/",
"https://www.google.com/",
"https://www.zocdoc.com/"
];
const getTitle = async (p, url) => {
try{
await p.goto(url);
const title = await p.title();
console.log(title);
}
catch(e) {
console.log(e)
}
return title
};
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
console.log(this)
for (var url of list) {
getTitle(page, url)
}
await browser.close();
})();

There are multiple issues in this example.
You should await the call to function getTitle, you re awaiting inside the function but you have to await the call to the function too.
You should surround getTitle with a try and catch block and check inside the function if theres a title to return (ex. the title for google is null)
const puppeteer = require("puppeteer");
list = [
"https://www.facebook.com/",
"https://www.google.com/",
"https://www.zocdoc.com/"
];
const getTitle = async (p, url) => {
try{
await p.goto(url);
const title = await p.title();
if(title){
return title
}
}
catch(e) {
throw(e)
console.log(e)
}
};
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
console.log(this)
for (var url of list) {
try{
console.log(await getTitle(page, url))
}
catch(e ){
console.log('No title')
}
}
await browser.close();
})();

How to return value from async/await function?

Using puppeteer to collect data from 2 different webpages into arrays for later comparison. However the program does not wait for the returned array before carrying forward.
async function go(){
try{
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++){
const td = tds[i];
const tdcontent = await page.evaluate(td => td.innerText, td);
if (tdcontent.length > 5) {
data[i] = {"content": tdcontent};
}
}
return data;
} catch (e) {
console.log(e);
}
};
(async function main(){
const returnedData = await go();
console.log(returnedData.length);
})();
The return data.length is 0. New to nodejs, and async programming structure. I think it is because the .length is logged before the data is returned?
how do I return the data in a way where can manipulate it and complete my comparisons?

I try to not use page.$$ in such cases. Instead I use document.querySelectorAll and map thru the elements and extract the text.
Here is the modified code:
const getTdData = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("http://example.com");
return page.evaluate(() => {
// get all td elements
const tdList = [...document.querySelectorAll("td")];
return tdList.map(element => ({ content: element.textContent }));
});
} catch (e) {
console.log(e);
}
};
(async function main() {
const returnedData = await getTdData();
console.log(returnedData.length);
})();

First of all, you are missing an apostrophe in your page.$$() function. You should change this to:
const tds = await page.$$('td');
Next, you are trying to pass a non-existent variable to page.evaluate(). You can fix this by passing tds[i] instead of td:
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
Your final result should look something like this:
const go = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++) {
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
if (tdcontent.length > 5) {
data[i] = {
content: tdcontent,
};
}
}
return data;
} catch (error) {
console.log(error);
}
};
(async function main() {
const returnedData = await go();
console.log(returnedData.length);
})();
If you are are still experiencing issues, you may want to wait until the page has loaded completely using page.goto( ... , { waitUntil: 'networkidle0' }), or wait until the element in question has been added to the DOM using page.waitForSelector():
await page.goto('www.webpage.com' , {
waitUntil: 'networkidle0',
});
// ...
await page.waitForSelector('td');

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Can't scrape and print the links on the fly - node.js

Related

How to get the quantity of children inside the element using xpath query on the puppeter?

HTML not extracted NODEJS

Puppeteer getting element from elementHandle causing protocol error

puppeteer : cant log in and loop through urls

How to return value from async/await function?

Categories

Resources