I am using Puppeteer to scrape data from a YouTube playlist, but I cannot get any data.
I have tried the selector in the browser with querySelector, but I want to automate this process and generate a JSON file as its output.
code
// Loads a YouTube playlist page and attempts to collect the elements matched by selectorA.
const puppeteer = require('puppeteer');
(async () => {
console.log("begin");
// The browser is launched with headless disabled.
const browser = await puppeteer.launch({headless : false });
const page = await browser.newPage();
console.log("after newPage");
await page.goto('https://www.youtube.com/playlist?list=PL2-FkZlJhxqVXZO1c6gKgsAdiet0zcOAO');
console.log("after goto ");
// The same selector is used both for waiting and for querying.
const selectorA = "a.yt-simple-endpoint.ytd-playlist-video-renderer"
await page.waitForSelector(selectorA);
console.log("after waitForSelector ");
// $$eval hands the matched elements to the callback as `rows`.
const items = await page.$$eval(selectorA, rows => {
console.log("eval " + rows);
// NOTE(review): `rows` holds DOM elements and is returned unchanged; eval functions can
// only transfer serializable data, which matches the reported `items undefined` output.
return rows;
});
console.log("items " + items);
await browser.close();
})();
results
begin
after newPage
after goto
after waitForSelector
items undefined
Screenshot of the same selector in the browser
According to the docs, the various eval functions can transfer only serializable data (roughly, the data JSON can handle, with some additions). Your code returns an array of DOM elements, which are not serializable (they have methods and circular references). Try to retrieve the data in the browser context and return only serializable data. For example:
return rows.map(row => [row.innerText, row.href]);
Related
I am trying to use puppeteer to get data from a website, mostly for learning purposes, but I am getting the following error:
Error: Evaluation failed: TypeError: Cannot read properties of null
(reading 'innerHTML')
I tested by removing the .innerHTML from the result and it logs the whole element as an object successfully, so I know I'm hitting the right element; it's when I add .innerHTML (I tried .innerText too) that it errors.
I suspect its down to some delay in the page loading as very occasionally it does work but I am not sure how to go about fixing that.
/**
 * Opens the job site, accepts the cookie banner, submits a search and reads the
 * `.gtmJobListingPostedBy` element of every `.job-result-card` on the result page.
 *
 * @param searchJob      text typed into the `#keywords` field
 * @param searchLocation text typed into the `#location` field
 *
 * NOTE(review): nothing is returned; `grabJobs` is assigned but never used afterwards.
 */
async function getData(searchJob,searchLocation){
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
await page.goto("https://somewebsite");
// Wait for the cookie-consent button to exist before clicking it.
await page.waitForSelector('#onetrust-accept-btn-handler');
//Click the accept cookies button
await page.evaluate(()=>{
document.querySelector('#onetrust-accept-btn-handler').click();
})
await page.type("#keywords",searchJob);
await page.type("#location",searchLocation);
// The click and the navigation wait are started together and awaited as a pair.
await Promise.all([page.click(".btn-search"),page.waitForNavigation()]);
// NOTE(review): there is no waitForSelector for `.job-result-card` before this evaluate call.
const grabJobs = await page.evaluate(() =>{
const jobs = document.querySelectorAll(".job-result-card"); //get the overall container for all of the jobs
let jobsArray = []; //create an array to put the job details into
jobs.forEach((jobTag)=>{ //loop through the retrieved jobs
// querySelector yields null when a card has no matching child.
const company = jobTag.querySelector(".gtmJobListingPostedBy");
// NOTE(review): `companyText` is not declared anywhere in this listing; the element
// looked up on the previous line is stored in `company`.
jobsArray.push([companyText.innerHTML])
})
console.log(jobsArray);
return jobsArray;
})
// NOTE(review): `browser2` is not declared in this listing; the browser launched above is `browser`.
await browser2.close();
}
I'm facing an issue trying to scrape data from the web with Puppeteer and querySelector.
I have a Node.js web server that handles a POST request and then calls a function to scrape the data. I'm sending 2 parameters (postBlogUrl & postDomValue).
postDomValue will contain, as a string, the selector I'm trying to fetch data from, for example:
[itemprop='articleBody'].
If I hard-code the selector ([itemprop='articleBody']), everything works and I'm able to retrieve the data, but if I use the postDomValue variable, nothing is returned.
I already tried to escape the var using CSS.escape(postDomValue), but no luck.
/**
 * Loads postBlogUrl in a new page and evaluates a callback there to read the article markup.
 *
 * @param postBlogUrl  URL passed to page.goto.
 * @param postDomValue Selector string; in this listing it appears only inside a commented-out line.
 * @returns the value page.evaluate resolves to, or the caught error object on failure.
 *
 * NOTE(review): the launched browser is never closed in this listing.
 */
fetchBlogContent: async function(postBlogUrl, postDomValue) {
try {
const puppeteer = require('puppeteer');
const browser = await puppeteer.launch();
// NOTE(review): `page` is assigned without const/let in this listing.
page = await browser.newPage();
await page.goto(postBlogUrl, {
waitUntil: 'load'
})
// NOTE(review): the callback declares no parameters and page.evaluate is given no extra
// arguments, so postDomValue is not handed to the callback.
let description = await page.evaluate(() => {
//This works return document.querySelector("[itemprop='articleBody']").innerHTML;
//This won't return document.querySelector(postDomValue).innerHTML;
})
return description
} catch (err) {
// handle err
// The error object is returned to the caller rather than rethrown.
return err;
}
}
// Hand the Node-side selector to the page as an extra page.evaluate() argument;
// it arrives as the callback's parameter.
// The string is passed as-is: wrapping it in JSON.stringify() would add literal
// double quotes around it, and querySelector would receive an invalid selector.
const description = await page.evaluate(
  (selector) => document.querySelector(selector).innerHTML,
  postDomValue,
);
See docs on how to pass args to page.evaluate() in puppeteer
If I understand correctly, the issue may be that you try to use a variable declared in the Node.js context inside an argument function of page.evaluate() that is executed in the browser context. In such cases, you need to transfer the value of a variable as an additional argument:
// The reader function is executed by page.evaluate() in the browser context, so the
// Node-side selector is supplied as an additional argument and received as a parameter.
const readInnerHtml = (cssSelector) => document.querySelector(cssSelector).innerHTML;
let description = await page.evaluate(readInnerHtml, postDomValue);
See more in page.evaluate().
I'm trying to get the src values for all images on Bing image search for a search term. I am using puppeteer for it. I wrote a selector to grab each image tag and it works in the Chrome DevTools. It, however, isn't working when I write it in the code-
// Opens a Bing image search and tries to print the src of every result thumbnail.
const puppeteer = require("puppeteer");
(async () => {
try{
let url = `https://www.bing.com/images/search?q=cannabis`
const browser = await puppeteer.launch({headless: false})
const page = await browser.newPage()
await page.goto(url)
// Wait until at least one thumbnail image is present.
await page.waitForSelector("ul.dgControl_list li img.mimg")
console.log(await page.evaluate(() => {
// NOTE(review): the array is built inside a braced function body but not returned,
// which matches the reported `undefined` output.
Array.from(document.querySelectorAll("ul.dgControl_list>li img.mimg"), img => img.src)
}))
// NOTE(review): the browser is never closed in this listing.
} catch(err){
console.log("error - " + err)
}
})()
I get the output as an object containing arrays of 10 items each in the devTools, but when I run it in the console through my code, it is undefined. How do I read this object?
You are not returning any data from the page.evaluate call. To return the data you have to use the return statement or use the short syntax (as explained below):
// Collect the src of every matching thumbnail inside the page, then print the array
// on the Node side. The explicit `return` is what hands the array back to Node.
const imageSources = await page.evaluate(() => {
  const thumbnails = document.querySelectorAll("ul.dgControl_list>li img.mimg");
  return Array.from(thumbnails, (thumbnail) => thumbnail.src);
});
console.log(imageSources);
Explanation: Arrow function
The arrow function has two ways to write them. One is the short syntax, you can use it like this:
const func = () => 1; // short syntax: the expression's value is returned implicitly, so func() returns 1
You can only put in one statement in there (which might call other statements though). Alternatively, you can use the long form:
const func = () => { return 1; }; // long form with a braced body: same result as the short syntax, but `return` is required
You can use variable declarations and any kind of code inside this function (just as in a normal function() { ... }), but this time you have to use return to return a value.
Therefore, as an alternative, you could also write this (short syntax):
// Short syntax: the arrow function has an expression body, so the array built by
// Array.from is returned implicitly and becomes the value page.evaluate() resolves to.
console.log(await page.evaluate(
() => Array.from(document.querySelectorAll("ul.dgControl_list>li img.mimg"), img => img.src)
))
I'm trying to return the whole windows object from a page, and then traversing the object outside of puppeteer.
I'm trying to access the data in Highcharts property, for which I need to access the window object. The normal javascript code being something like window.Highcharts.charts[0].series[0].data.
I thought the easiest way would be to use puppeteer to access the site, and just send me back the windows object, which I could then use outside of puppeteer like any other JS object.
After reading the documentation, I'm finding it difficult to return the object as it would appear just putting 'window' into the chrome console. I'm not sure what I'm missing?
I've read through the documentation, and the following two methods seem like they should work?
// Tries two ways of getting page-side objects back into Node: querying Map instances
// with queryObjects, and passing a handle to `window` through page.evaluate.
// NOTE(review): `puppeteer` is not required anywhere in this listing.
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.example.com', {waitUntil: 'networkidle2'});
// METHOD 1
// Create a Map object
await page.evaluate(() => window.map = new Map());
// Get a handle to the Map object prototype
const mapPrototype = await page.evaluateHandle(() => Map.prototype);
// Query all map instances into an array
const mapInstances = await page.queryObjects(mapPrototype);
console.log(mapInstances);
// Release both handles once they have been logged.
await mapInstances.dispose();
await mapPrototype.dispose();
// METHOD 2
// Wrap window and document in a single object and keep a handle to it.
const handle = await page.evaluateHandle(() => ({window, document}));
const properties = await handle.getProperties();
const windowHandle = properties.get('window');
// NOTE(review): documentHandle is never used below.
const documentHandle = properties.get('document');
// NOTE(review): this asks page.evaluate to return the window object itself; presumably
// it cannot be serialized as a whole — confirm against the page.evaluate documentation.
var result = await page.evaluate(win => win, windowHandle);
console.log(result)
await handle.dispose();
await browser.close();
})();
However, it only returns the following in the console, and not the simple object I would like;
Not sure if I'm going about this the right way, so any help/advice is much appreciated.
Background:
I am writing a Nodejs script with puppeteer to web scrape data from a web page. I'm not familiar with Nodejs, promises, or puppeteer. I've tried many things and done research for a few days.
Application Flow:
With automation, go to a website
Scrape data from the page, push to an array
If there is a "next page" click the next page button
Scrape data from the page, push to same array
Repeat
Problem:
My problem is with #3. With web automation, clicking the next page button.
All I want, is to use the .click() method in puppeteer, to click on the button selector. However, .click() returns a Promise. Since it's a promise, I need keyword await, but you can't have await in the for loop (or any block other than async).
What Have I Tried:
I've tried creating another async function, with statements for await page.click();and calling that function in the problem area. I've tried creating a regular function with page.click() and calling that in the problem area. Refactoring everything to have it not work as well. I'm not really understanding Promises and Async/Await even after reading about it for a few days.
What I Want Help With:
Help with invoking the .click() method inside the problem area or any help with selecting the 'Next Page' using web automation.
Pseudo Code:
let scrape = async () => {
await //do.some.automation;
const result = await page.evaluate(() => {
for (looping each page) {
if (there is a next page) {
for (loop through data) {
array.push(data);
//----PROBLEM----
//use automation to click the selector of the next page button
//--------------
}
}
}
return data;
});
//close browser
return result;
};
scrape().then((value) => {
//output data here;
});
});
All Code:
/**
 * Launches a browser, opens the site, clicks a button and then evaluates a callback in
 * the page that walks the pager links and collects the text of list items.
 *
 * @returns the value page.evaluate resolves to (the `allJobs` array built in the callback).
 */
let scrape = async () => {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto("GO TO A WEBSITE");
await page.click("CLICK A BUTTON");
// Fixed 2000 ms pause before scraping — presumably to let the clicked view load.
await page.waitFor(2000);
//Scraping
const result = await page.evaluate(() => {
// The third `results-paging` element is taken as the pager container.
let pages = document.getElementsByClassName("results-paging")[2];
let allPages = pages.getElementsByClassName("pagerLink");
let allJobs = [];
//Loop through each page
for (var j = 0; j < allPages.length; j++) {
let eachPage = pages.getElementsByClassName("pagerLink")[j].innerHTML;
if (eachPage) {
//Scrape jobs on single page
// The third <ul> in the document is taken as the job list.
let listSection = document.getElementsByTagName("ul")[2];
let allList = listSection.getElementsByTagName("li");
for (var i = 0; i < allList.length; i++) {
let eachList = listSection.getElementsByTagName("li")[i].innerText;
allJobs.push(eachList);
//--------PROBLEM-------------
// NOTE(review): the enclosing evaluate callback is not declared async, which matches
// the reported `SyntaxError: await is only valid in async function`. This call also
// sits inside the innermost loop, so it is reached once per list item.
await page.click('#selector_of_next_page');
//----------------------------
}
}
else {
window.alert("Fail");
}
}
return allJobs;
});
// NOTE(review): browser.close() is not awaited here.
browser.close();
return result;
};
// Run the scraper, print the collected entries and save them to RESULTS.txt.
scrape()
  .then((value) => {
    const data = value.join("\r\n");
    console.log(data);
    fs.writeFile("RESULTS.txt", data, (err) => {
      if (err) {
        // Report the write failure instead of claiming success.
        console.error(err);
        return;
      }
      console.log("SUCCESS MESSAGE");
    });
  })
  .catch((err) => {
    // Surface a rejected scrape() rather than leaving the rejection unhandled.
    console.error(err);
  });
Error Message:
SyntaxError: await is only valid in async function
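You cannot use page methods inside the function passed to page.evaluate(): that function is executed in the browser context, where the Node.js `page` object is not available.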
Based on your example you should change
await page.click('#selector_of_next_page');
to native JS equivalent
document.getElementById('selector_of_next_page').click();