HTML not extracted NODEJS - node.js

I want to scrape twitter tweets using nodejs and puppeteer
I don't want to create a developer account etc
The code below returns null when run from the scraper, but when I run the same statement in the browser console on the Twitter website, it gives me the HTML:
var html = document.querySelector('main nav').nextElementSibling;
Code
'use strict';
const puppeteer = require('puppeteer');
/**
 * Opens a Twitter profile in a non-headless browser and tries to hand a DOM
 * element from the page back to Node.
 *
 * @returns {Promise<*>} Resolves with whatever the in-page `page.evaluate`
 *   call produced; rejects with any error thrown along the way.
 */
function run() {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch({
headless : false
});
const page = await browser.newPage();
await page.setRequestInterception(true);
// add header for the navigation requests
page.on('request', request => {
// Do nothing in case of non-navigation requests.
if (!request.isNavigationRequest()) {
request.continue();
return;
}
// Add a new header for navigation request.
// With the proxy line commented out, the headers are passed through unchanged.
const headers = request.headers();
// headers['proxy'] = super_proxy;
request.continue({ headers });
});
await page.goto("https://www.twitter.com/Udemy");
// Scroll to the bottom of the page, then pause and wait for the nav element.
await page.evaluate(`window.scrollTo(0, document.body.scrollHeight)`);
await page.waitFor(5000);
await page.waitFor('main nav');
let urls = await page.evaluate(() => {
// `results`, `parser` and `$` are never used for the returned value.
let results = [];
var parser = new DOMParser();
var html = document.querySelector('main nav').nextElementSibling;
// NOTE(review): `html` is an Element here, while parseFromString is given it as markup text.
var $ = parser.parseFromString(html, 'text/html');
// Re-declares `html`, discarding the element selected above.
var html = document.querySelector('section > div > div > div');
//Error return empty HTML --------------------- <<<<<<<<<<<<<
// A DOM element is not serializable, so page.evaluate cannot transfer it
// back to Node (the Puppeteer docs say it resolves to undefined instead).
return html;
})
// The close() promise is not awaited before resolving.
browser.close();
return resolve(urls);
} catch (e) {
// The browser is not closed on this path.
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
Result that I want

From the docs:
If the function passed to the page.evaluate returns a non-Serializable value, then page.evaluate resolves to undefined.
and here
The only difference between page.evaluate and page.evaluateHandle is that page.evaluateHandle returns in-page object (JSHandle).
replace page.evaluate with page.evaluateHandle:
let urls = await page.evaluateHandle(() => { ... return html })

Related

Dynamic Web Scraping not returning a complete NodeList

I'm trying to extract the content of a span tag from dynamically generated HTML, but when I use querySelectorAll, it returns only the _prevClass of the first element in the NodeList and nothing else.
My code:
// Loads the price page with Nightmare (created with `show: true`) and tries to
// return every `span.amount` element from the page to Node.
const Nightmare = require('nightmare');
const nightmare = Nightmare({ show: true });
const url = 'https://mir4draco.com/price';
nightmare
.goto(url)
// NOTE(review): the callback returns a NodeList of DOM elements rather than
// plain data (e.g. their text); presumably that is why the logged result is
// not the five elements expected — confirm against the Nightmare docs.
.evaluate(() => document.querySelectorAll('span.amount')) // Should return a NodeList with 5 elements
.end()
.then(response => {
// Logs whatever value came back from the evaluate step.
console.log(response);
}).catch(error => {
console.error('Search failed:', error);
});
The output:
{ '0': { _prevClass: 'amount' } }
This is the output in the browser
Well, after some tests I found a solution.
For some reason, this site only loads the tags I need after a few seconds when it is driven by an automation tool.
So I switched to Puppeteer and used the slowMo option.
Now it works perfectly:
const puppeteer = require('puppeteer');
const url = 'https://mir4draco.com/price'
/**
 * Opens the price page in a slowed-down browser and collects the text of
 * every `span.amount` element.
 *
 * @returns {Promise<string>} The `innerText` of each matching element, each
 *   followed by a single space (empty string when nothing matches).
 */
const precoDraco = async function () {
  // slowMo delays every Puppeteer operation, which gives the page time to
  // render the price tags that only appear a few seconds after loading.
  const browser = await puppeteer.launch({ slowMo: 10000 });
  try {
    const page = await browser.newPage();
    await page.goto(url);
    // Only a plain string crosses the page/Node boundary, so the text is
    // concatenated inside the page and handed back to the caller.
    return await page.evaluate(() => {
      let prices = "";
      document.querySelectorAll('span.amount').forEach(element => {
        prices += element['innerText'] + " ";
      });
      return prices;
    });
  } finally {
    // Always release the browser, even when navigation or evaluation fails.
    await browser.close();
  }
};
Now this works as it should

Requests for multiple pages with puppeteer

I am trying to get information (emails and company names) from many sites with dynamic content, using Puppeteer and an array of links. I use a "for" loop to iterate over the array, call page.goto for each site, wait until the site has loaded, wait several more seconds for the dynamic content, and then start extracting data. However, only the first and the last requests complete (their Promises resolve); the other promises don't return the dynamic content. What should I do to fix that? Thanks
let puppeteer = require('puppeteer');
// Visits `${url}${name}` for each entry of arrayNames on one shared page,
// waits 7000 ms after the load event, then reads an e-mail out of the DOM.
(async() => {
const browser = await puppeteer.launch();
let page = await browser.newPage();
const url = 'https://abcdsite.com/';
let arrayNames = ['first','second','third','abcd'];
// The loop header has no increment; `i` is only advanced in the final .then() below.
// `arrayNames.length` and `arrayNames[i]` are plain values, so awaiting them has no useful effect.
for(let i=0;i<await arrayNames.length;){
let nameUrl = await arrayNames[i];
if (i<4){
let temp1;
console.log(`begin for ${nameUrl}`);
await page.goto(`${url}${nameUrl}`, { waitUntil: 'load' })
.then(()=>{
return new Promise(res=>{
//wait content dynamic load
setTimeout(()=>{
// temp1 receives the Promise returned by page.evaluate, not the e-mail itself.
temp1 = page.evaluate(() => {
return new Promise(resolve => { // <-- return the data to node.js from browser
// `name` is read but never used.
let name = document.querySelector('h1').innerHTML;
let email = document.getElementsByClassName('sidebar-views-contacts h-card vcard')[0]
.children[2].children[0].children[0].innerHTML;
resolve(email);
});
});
res(temp1);
},7000);
})
})
.then((res)=>{
i++;
// Logs `temp1` (the Promise assigned above) rather than the resolved value `res`.
console.log(`https://abcdsite.com/${nameUrl}`,temp1);
});
}
else{
break
}
}
// The browser is never closed in this snippet.
})();
I think this will help you.
1) Write an async function that requests and parses your data.
2) Create an array of parallel tasks and wait for all of them.
let puppeteer = require('puppeteer');
/**
 * Navigates `page` to `${url}${nameUrl}`, waits for the dynamic content to
 * appear and returns the contact e-mail found in the sidebar.
 *
 * @param {object} page - Puppeteer page used for the navigation.
 * @param {string} url - Base URL of the site.
 * @param {string} nameUrl - Path segment appended to `url`.
 * @param {number} [delayMs=7000] - How long to wait after the load event
 *   before reading the DOM, in milliseconds.
 * @returns {Promise<string>} The inner HTML of the e-mail element.
 */
async function makeRequest(page, url, nameUrl, delayMs = 7000) {
  await page.goto(`${url}${nameUrl}`, { waitUntil: 'load' });
  // Awaiting a promise-wrapped timer keeps the function pending until the
  // delay is over; returning a value from a bare setTimeout callback would
  // be discarded and the function would resolve to undefined immediately.
  await new Promise((resolve) => setTimeout(resolve, delayMs));
  return page.evaluate(() => {
    const contacts = document.getElementsByClassName('sidebar-views-contacts h-card vcard')[0];
    return contacts.children[2].children[0].children[0].innerHTML;
  });
}
// Fetches every page in parallel and logs the e-mail found on each one.
(async () => {
  const browser = await puppeteer.launch();
  const url = 'https://abcdsite.com/';
  const arrayNames = ['first', 'second', 'third', 'abcd'];
  try {
    // Each task gets its own page: concurrent navigations on one shared page
    // would overwrite each other and only the last one would survive.
    const tasks = arrayNames.map(async (nameUrl) => {
      const page = await browser.newPage();
      try {
        return await makeRequest(page, url, nameUrl);
      } finally {
        await page.close();
      }
    });
    const results = await Promise.all(tasks);
    results.forEach((email, i) => {
      console.log(`https://abcdsite.com/${arrayNames[i]}`, email);
    });
  } finally {
    // Close the browser on success and on failure so the process can exit.
    await browser.close();
  }
})().catch(console.error);
Series solution
For more information read this.
// Sequential variant: each request finishes before the next one starts.
for (const nameUrl of arrayNames) {
  const email = await makeRequest(page, url, nameUrl);
  console.log(`https://abcdsite.com/${nameUrl}`, email);
}
puppeteer's page.goto function has multiple parameters you can use to ensure that the page is fully loaded. See the documentation here.
In addition, you can use the page.waitFor method to wait for a few seconds. See documentation here.
Here you have a simple example that I think may work for you:
const puppeteer = require('puppeteer')
const url = 'https://stackoverflow.com/'
const arrayNames = ['tags', 'users', 'jobs', 'questions'];
// Visits each section of the site in turn on a single page and collects the
// heading and title text of every page into `data`, keyed by full URL.
(async () => {
  const browser = await puppeteer.launch()
  const data = {}
  try {
    const page = await browser.newPage()
    for (const nameUrl of arrayNames) {
      const fullUrl = `${url}${nameUrl}`
      console.log(`begin for ${fullUrl}`)
      await page.goto(fullUrl, { waitUntil: 'networkidle0' }) // check networkidle0 parameter and others here: https://pptr.dev/#?product=Puppeteer&version=v2.1.1&show=api-pagegotourl-options
      await page.waitFor(2000) // wait 2 seconds to allow a full login. Optional
      const pageData = await page.evaluate(() => {
        // A page without the element yields null instead of throwing inside
        // the browser and aborting the whole run.
        const textOf = (selector) => {
          const element = document.querySelector(selector)
          return element ? element.innerText : null
        }
        // get whatever data you need to get from the page.
        return { name: textOf('h1'), title: textOf('title') }
      })
      console.log('\t Data from page: ', pageData)
      data[fullUrl] = pageData
    }
    console.log(data)
  } finally {
    // Without this the Chromium process stays alive and Node never exits.
    await browser.close()
  }
})()
This does not load the sites in parallel, but you can use the example as a starting point.
To run them in parallel, instead of awaiting each page.evaluate call in turn (using a separate page per site), collect all the promises in an array and then use await Promise.all(listOfPromises).

Scraper with Puppeteer login returns just one element of the array

This code is supposed to loop through the URLs that are scraped by the scrapeProductPage function. Before looping, it needs to log in so that it can obtain the prices, which are only displayed to logged-in users. Instead of looping through the URLs, it just returns the scraped data from one page. The warning I get is "MaxListenersExceededWarning: Possible EventEmitter memory leak detected".
const request = require("request-promise");
const cheerio = require("cheerio");
const ObjectsToCsv = require("objects-to-csv");
const puppeteer = require('puppeteer');
const url = "https://www.example.com";
const scrapeResults = [];
/**
 * Downloads the listing page at the module-level `url` and collects the link
 * of every product anchor found in it.
 *
 * @returns {Promise<Array<{url: string}>|undefined>} The module-level
 *   `scrapeResults` array with one `{ url }` entry appended per anchor, or
 *   undefined when the request or parsing throws (the error is only logged).
 */
async function scrapeProductPage() {
try {
const htmlResult = await request.get(url);
const $ = await cheerio.load(htmlResult);
$("td.productListing-data > a[style='position:relative;float:left;']").each((index, element) => {
// This inner `url` shadows the module-level `url` constant.
let url = $(element).attr("href");
url = "https\://www.example.com/" + url;
const scrapeResult = { url };
scrapeResults.push(scrapeResult);
});
return scrapeResults;
} catch (err) {
console.error(err);
}
}
/**
 * Logs in through the site's login form, then visits every product URL and
 * adds `main_img`, `name` and `price` to each job object.
 *
 * @param {Array<{url: string}>} productsWithImages - Jobs to enrich; each
 *   object is mutated in place.
 * @returns {Promise<Array>} One entry per job: the enriched job, or undefined
 *   for a job whose scraping threw (the error is only logged).
 */
async function scrapeDescription(productsWithImages) {
// Passing 0 lifts the listener limit on `process`, which silences the
// MaxListenersExceededWarning rather than addressing its cause.
process.setMaxListeners(0);
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.example.com/login');
await page.waitFor(500);
await page.waitFor('input[name="email_address"]');
await page.type('input[name="email_address"]', 'example#gmail.com');
await page.type('input[name="password"]', '123test');
// NOTE(review): nothing waits for the login to complete after this click — confirm it is not needed.
await page.click('#btnLogin');
// map() starts every async callback immediately, so all page.goto calls run
// concurrently against the same `page`.
// NOTE(review): presumably this is why only one page's data comes back — verify.
return await Promise.all(
productsWithImages.map(async job => {
try {
await page.goto(job.url, { waitUntil: "load" });
const content = await page.content();
const $ = await cheerio.load(content);
job.main_img = $('img#main_img').attr('src');
job.name = $('h2').text();
job.price = $("td.products_info_price").text();
return job;
} catch (error) {
console.error(error);
}
})
);
// The browser is never closed in this function.
}
/**
 * Wraps `data` in an ObjectsToCsv instance and logs that instance.
 * Nothing is written to disk in this function.
 *
 * @param {Array<object>} data - Rows to wrap.
 * @returns {Promise<undefined>}
 */
async function saveDataToCsv(data) {
const csv = new ObjectsToCsv(data);
console.log(csv);
}
/**
 * Runs the whole pipeline: collect product URLs, enrich them with details,
 * then hand the result to saveDataToCsv.
 *
 * @returns {Promise<undefined>}
 */
async function scrapeWona() {
const productsWithImages = await scrapeProductPage();
// `wonaFullData` is never used afterwards; scrapeDescription mutates the job
// objects inside `productsWithImages`, which is what gets saved below.
const wonaFullData = await scrapeDescription(productsWithImages);
await saveDataToCsv(productsWithImages);
}
scrapeWona();
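The warning means that more listeners were added to a single EventEmitter than its limit allows (10 by default), which usually indicates a memory leak somewhere in the code.
Calling process.setMaxListeners(0) only lifts that limit for the process object; it does not remove the cause of the warning.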
You can take a look at the documentation here also: https://nodejs.org/docs/latest/api/events.html#events_emitter_setmaxlisteners_n
Take a look at the answer from here: node.js - request - How to "emitter.setMaxListeners()"?

Puppeteer getting element from elementHandle causing protocol error

I'm trying to scrape a certain facebook page for its posts written by a certain user and starting with a certain word.
const puppeteer = require('puppeteer');
/**
 * Intended to return the posts written by USERNAME whose text matches
 * `[test <number>]`.
 *
 * @param {object} page - Puppeteer page showing the group feed.
 * @returns {Promise<Array>} Element handles selected by `.userContentWrapper`.
 */
async function findPosts(page) {
const USERNAME = 'test123';
const posts = await page.$$('.userContentWrapper');
// Array.prototype.filter does not wait for promises: the async callback
// returns a Promise (always truthy), so every post is kept and filter()
// returns before any of the awaited work below has run.
return posts.filter(async post => {
try {
let usernameElement = await post.$('.fwb');
let username = await page.evaluate(element => element.textContent, usernameElement);
if (username === USERNAME) {
let postElement = await post.$('[data-testid="post_message"] p');
// No `await` here, so `postContent` is a Promise and the regex below is
// tested against its string form rather than the post text.
let postContent = page.evaluate(element => element.textContent, postElement);
return /\[test \d+\]/.test(postContent);
}
return false;
} catch(e) {
console.log(e);
return false;
}
});
}
// Opens the group page in a non-headless browser, logs its title, logs the
// result of findPosts and closes the browser.
(async () => {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.facebook.com/groups/groupid/');
const pageTitle = await page.title();
console.log(pageTitle);
const posts = await findPosts(page);
console.log(posts);
// Any work findPosts left pending at this point loses its target once the
// browser is closed.
await browser.close();
})();
I'm getting
Error: Protocol error (Runtime.callFunctionOn): Target closed. when
I'm trying to get the usernameElement
at this line:
let usernameElement = await post.$('.fwb');
Not sure what's going wrong here, any suggestions?
The problem is that the filter function does not work with Promises. So the return posts.filter(...) will immediately return and after that the browser is closed. Therefore, when you try to run the $ function on the page, the page does not exist anymore and you get the Target closed error.
To make it work with async/await syntax, you could use a simple loop instead:
/**
 * Returns the posts written by USERNAME whose text contains `[test <number>]`.
 *
 * The posts are checked one after another in a plain loop so that every
 * awaited Puppeteer call has finished before the function returns.
 *
 * @param {object} page - Puppeteer page showing the group feed.
 * @returns {Promise<Array>} Element handles of the matching posts, in page order.
 */
async function findPosts(page) {
  const USERNAME = 'test123';
  const posts = await page.$$('.userContentWrapper');
  const postsToReturn = [];
  for (const post of posts) {
    try {
      const usernameElement = await post.$('.fwb');
      // post.$ yields null when nothing matches; skip instead of evaluating on null.
      if (!usernameElement) continue;
      const username = await page.evaluate(element => element.textContent, usernameElement);
      if (username !== USERNAME) continue;

      const postElement = await post.$('[data-testid="post_message"] p');
      if (!postElement) continue;
      const postContent = await page.evaluate(element => element.textContent, postElement);
      if (/\[test \d+\]/.test(postContent)) {
        postsToReturn.push(post); // instead of return true
      }
    } catch (e) {
      // A failure on one post is logged and must not discard the others.
      console.log(e);
    }
  }
  return postsToReturn;
}

puppeteer : cant log in and loop through urls

Hi guys, I want to log in to a website and, once authenticated, loop through a given set of URLs and scrape data. What I intend to do is illustrated by the example below; however, I get an "Unhandled promise rejection" error.
const puppeteer = require("puppeteer");
// URLs to visit. Assigned without const/let/var, so this creates an implicit
// global in sloppy mode.
list = [
"https://www.facebook.com/",
"https://www.google.com/",
"https://www.zocdoc.com/"
];
/**
 * Navigates page `p` to `url` and logs the page title.
 *
 * @param {object} p - Puppeteer page.
 * @param {string} url - Address to open.
 * @returns {Promise<string>} Intended to resolve with the title (see the note
 *   on the return statement).
 */
const getTitle = async (p, url) => {
try{
await p.goto(url);
const title = await p.title();
console.log(title);
}
catch(e) {
console.log(e)
}
// `title` is declared with const inside the try block, so it is out of scope
// here; this line throws a ReferenceError and the returned promise rejects.
return title
};
// Launches a browser, starts getTitle for every URL on one shared page and
// closes the browser.
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
console.log(this)
for (var url of list) {
// Not awaited: the calls overlap on the same page, and their rejections
// have no handler attached.
getTitle(page, url)
}
// Runs without waiting for the getTitle calls started above.
await browser.close();
})();
There are multiple issues in this example.
You should await the call to getTitle: you are awaiting inside the function, but you have to await the call to the function too.
You should wrap the call to getTitle in a try/catch block and check inside the function whether there is a title to return (e.g. the title for Google is null).
const puppeteer = require("puppeteer");
// URLs to visit, in order. Declared with const so the snippet does not rely
// on an implicit global (which is a ReferenceError in strict mode and in ES
// modules).
const list = [
  "https://www.facebook.com/",
  "https://www.google.com/",
  "https://www.zocdoc.com/"
];
/**
 * Navigates page `p` to `url` and returns the page title.
 *
 * @param {object} p - Puppeteer page used for the navigation.
 * @param {string} url - Address to open.
 * @returns {Promise<string|undefined>} The title, or undefined when the page
 *   has no (or an empty) title.
 * @throws Rethrows any navigation or evaluation error after logging it.
 */
const getTitle = async (p, url) => {
  try {
    await p.goto(url);
    const title = await p.title();
    if (title) {
      return title;
    }
    return undefined;
  } catch (e) {
    // Log before rethrowing: a statement placed after `throw` never runs.
    console.error(`Could not read the title of ${url}:`, e);
    throw e;
  }
};
// Visits every URL in `list` one after another on a single page and prints
// its title, or 'No title' when the page has none or cannot be loaded.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    for (const url of list) {
      try {
        // Awaiting here keeps the navigations sequential on the shared page.
        const title = await getTitle(page, url);
        console.log(title || 'No title');
      } catch (e) {
        console.log('No title');
      }
    }
  } finally {
    // Close the browser even if creating the page fails.
    await browser.close();
  }
})();

Resources