I have a function which scrapes an element and returns its value. This is the code of reale-scraper.js:
module.exports.RealeScraper = function() {
    return new Promise((res, rej) => {
        var url = 'example.com';
        var compagnia;
        //Start Puppeteer and scrape element
        ptr.launch().then(async browser => {
            const page = await browser.newPage();
            await page.setViewport({ width: 1280, height: 800 });
            await page.goto(url, {waitUntil: "networkidle0"});
            await page.type('input[name="username"]', config.utente);
            await page.type('input[name="password"]', config.pass);
            await Promise.all([
                page.click('input[type="SUBMIT"]'),
                page.waitForNavigation({waitUntil: 'networkidle2'})
            ]);
            await page.waitForSelector('#tableSceltaProfilo > tbody > tr:nth-child(1) > td:nth-child(2)');
            const element = await page.$("#tableSceltaProfilo > tbody > tr:nth-child(1) > td:nth-child(2)");
            compagnia = await page.evaluate(element => element.textContent, element);
            await page.screenshot({path: 'screenshot.png'});
            await browser.close();
        });
        res(compagnia);
    });
}
Then I call that function and try to send data to my EJS template in home.js:
var scraper = require('../scrapers/reale-scraper');

router.get('/home', function(req, res, next) {
    RealeScraper().then((compagnia) => {
        res.render('nuovo-sinistro', {
            titolo: 'Manager Perizie',
            compagnia: compagnia
        });
    }).catch((error) => {
        console.log(error);
    });
});
I want to wait until RealeScraper has finished and returned a value so that I can pass it to res.render. I've tried using a Promise but it doesn't work: it gives me no errors, but when I load the page the function doesn't start, so the page is rendered without the variable.
I've also tried different approaches but ended up with the page loading forever.
Any help would be really appreciated, thanks!
You start the Puppeteer scrape (the ptr.launch().then(...) part) and call res(compagnia); at the same time, so compagnia is still empty when the promise resolves.
Just call res once the scraping has finished:
...
await browser.close();
res(compagnia);
...
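For completeness, here is a condensed sketch of that promise-based fix (same selectors and config names as in your file; the viewport and screenshot calls are omitted for brevity, and page.$eval is used as a shorthand for page.$ plus page.evaluate):

module.exports.RealeScraper = function () {
    return new Promise((res, rej) => {
        const url = 'example.com';
        ptr.launch().then(async browser => {
            const page = await browser.newPage();
            await page.goto(url, { waitUntil: 'networkidle0' });
            await page.type('input[name="username"]', config.utente);
            await page.type('input[name="password"]', config.pass);
            await Promise.all([
                page.click('input[type="SUBMIT"]'),
                page.waitForNavigation({ waitUntil: 'networkidle2' })
            ]);
            const selector = '#tableSceltaProfilo > tbody > tr:nth-child(1) > td:nth-child(2)';
            await page.waitForSelector(selector);
            const compagnia = await page.$eval(selector, el => el.textContent);
            await browser.close();
            res(compagnia);  // resolve only after scraping has finished
        }).catch(rej);       // propagate Puppeteer errors instead of hanging the caller
    });
};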
I think it will be better if you only use async/await, like this:
module.exports.RealeScraper = async function () {
    var url = 'example.com';
    var compagnia;
    //Start Puppeteer and scrape element
    let browser = await ptr.launch();
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 800 });
    await page.goto(url, { waitUntil: "networkidle0" });
    await page.type('input[name="username"]', config.utente);
    await page.type('input[name="password"]', config.pass);
    await page.click('input[type="SUBMIT"]'); // why did you do this in parallel with the navigation wait?
    await page.waitForNavigation({ waitUntil: 'networkidle2' });
    await page.waitForSelector('#tableSceltaProfilo > tbody > tr:nth-child(1) > td:nth-child(2)');
    const element = await page.$("#tableSceltaProfilo > tbody > tr:nth-child(1) > td:nth-child(2)");
    compagnia = await page.evaluate(element => element.textContent, element);
    await page.screenshot({ path: 'screenshot.png' });
    await browser.close();
    return compagnia;
}
// ...
const { RealeScraper } = require('../scrapers/reale-scraper'); // destructure so RealeScraper() can be called directly

router.get('/home', async function (req, res, next) {
    try {
        let compagnia = await RealeScraper();
        res.render('nuovo-sinistro', {
            titolo: 'Manager Perizie',
            compagnia: compagnia
        });
    } catch (error) {
        console.log(error);
    }
});
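One more detail worth noting: if the scraper rejects, the route above only logs the error and never sends a response, which is exactly the "page loading forever" symptom you mentioned. A small sketch of the same catch block that always answers the request:

router.get('/home', async function (req, res, next) {
    try {
        const compagnia = await RealeScraper();
        res.render('nuovo-sinistro', {
            titolo: 'Manager Perizie',
            compagnia: compagnia
        });
    } catch (error) {
        next(error); // let Express's error handler send a 500 instead of leaving the request hanging
    }
});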
This is my code - scraping the course titles works OK, but I have a problem with the images:
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin());
const fs = require('fs/promises')

function sleep(ms)
{
    return new Promise(resolve => setTimeout(resolve, ms));
}

async function start()
{
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto("https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL");
    await sleep(5000);

    const names = await page.evaluate(() => {
        return Array.from(document.querySelectorAll(".course-list--container--3zXPS div.udlite-focus-visible-target.udlite-heading-md.course-card--course-title--vVEjC")).map(x => x.textContent)
    })

    const images = await page.evaluate(() => {
        return Array.from(
            document.querySelectorAll(".course-list--container--3zXPS div.course-card--image-wrapper--1F9ny")
        ).map((image) => image.getAttribute(`src`));
    });

    let m = ";";
    for (let i = 0; i < names.length; i++)
    {
        names[i] = i+m+names[i]+m+images[i]
    }

    await fs.writeFile("courses.txt", names.join("\r\n"))
    await page.screenshot({ path: "udemy.png", fullPage: true });
    await browser.close();
}

start()
Right now it returns null instead of the image URLs, and if I change src to srcset nothing changes.
The page I want to scrape the images from is https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL
On the screenshot that this script takes I can see that the course icons are blacked out. I can scrape the images that are visible on the screenshot, but not the ones that are blacked out.
OK, I found the answer - I added a setViewport call and img at the end of the querySelectorAll selector:
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin());
const fs = require('fs/promises')

function sleep(ms)
{
    return new Promise(resolve => setTimeout(resolve, ms));
}

async function start()
{
    const browser = await puppeteer.launch({ headless: false });
    const page = await browser.newPage();
    await page.goto("https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL", { "waitUntil": "networkidle0" });
    await sleep(1000);

    const bodyWidth = await page.evaluate(() => document.body.scrollWidth);
    const bodyHeight = await page.evaluate(() => document.body.scrollHeight);
    await page.setViewport({ width: bodyWidth, height: bodyHeight });
    await sleep(1000);

    const names = await page.evaluate(() => {
        return Array.from(document.querySelectorAll(".course-list--container--3zXPS div.udlite-focus-visible-target.udlite-heading-md.course-card--course-title--vVEjC")).map(x => x.textContent)
    })

    const images = await page.evaluate(() => {
        return Array.from(
            document.querySelectorAll(".course-list--container--3zXPS div.course-card--image-wrapper--1F9ny img")
        ).map((image) => image.getAttribute(`src`));
    });

    let m = ";";
    for (let i = 0; i < names.length; i++)
    {
        names[i] = i+m+names[i]+m+images[i]
    }

    await fs.writeFile("courses.txt", names.join("\r\n"))
    await page.screenshot({ path: "udemy.png", fullPage: true });
    await browser.close();
}

start()
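The viewport change works because the listing lazy-loads its thumbnails: images outside the visible area keep a placeholder until they scroll into view, so enlarging the viewport to the full page height makes every img receive its real src. If you prefer to keep a normal viewport, another option (just a sketch, using the same page object) is to scroll through the page before collecting the attributes:

// Sketch: scroll to the bottom in steps so lazy-loaded thumbnails get their src.
async function autoScroll(page) {
    await page.evaluate(async () => {
        await new Promise((resolve) => {
            let scrolled = 0;
            const step = 400;
            const timer = setInterval(() => {
                window.scrollBy(0, step);
                scrolled += step;
                if (scrolled >= document.body.scrollHeight) {
                    clearInterval(timer);
                    resolve();
                }
            }, 100);
        });
    });
}
// usage: await autoScroll(page); before the page.evaluate that reads the images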
New to Node.js and Puppeteer. I'm trying to loop through some street names in a public county property value search. However, I cannot see what I'm doing wrong. I had this working for an individual street name before I attempted to loop over street names. I've replaced the street names for protection.
const puppeteer = require('puppeteer');

var street_names = ["street1","street2","street3"]

for (var i = 0; i < street_names.length; i++) {
    // console.log(street_names[i]); // Used to test if the loop works.
    (async () => {
        const browser = await puppeteer.launch({executablePath: '/usr/bin/chromium-browser'});
        const page = await browser.newPage();
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setDefaultNavigationTimeout(0);
        // Property Search Page
        await page.goto('http://propaccess.traviscad.org/clientdb/PropertySearch.aspx?cid=1', {waitUntil: 'domcontentloaded'});
        //type the enter street
        await page.select('select[name="propertySearchOptions:recordsPerPage"]', '250'); // Select 250 results per page
        await page.screenshot({path: 'screenshot.jpg', fullPage: true});
        await page.focus('#propertySearchOptions_streetName');
        await page.type('input[name="propertySearchOptions:streetName"]', street_names[i]);
        //await page.keyboard.type('street_names[i]');
        await page.click('#propertySearchOptions_searchAdv');
        // Enter Results Page
        await page.screenshot({path: 'street_names[i]_screenshot.jpg', fullPage: true});
        await page._client.send('Page.setDownloadBehavior', {behavior: 'allow', downloadPath: './results'});
        await page.waitForSelector('#footer');
        await page.click('#propertySearchResults_exportResults');
        await page.waitForTimeout(3500);
        await page.screenshot({path: 'screenshot.jpg', fullPage: true});
        await browser.close();
        process.exit(1);
    });
}
You forgot to call the async function you defined (i.e. to add () after it).
It would also be more efficient to open the browser once and then reuse it and its page. To do so, you can place the loop inside the async function:
const puppeteer = require('puppeteer');

var street_names = ["street1","street2","street3"]; // the semicolon matters here, otherwise the (async ...) below is parsed as a call on the array

(async () => {
    const browser = await puppeteer.launch({executablePath: '/usr/bin/chromium-browser'});
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setDefaultNavigationTimeout(0);

    for (var i = 0; i < street_names.length; i++) {
        // console.log(street_names[i]); // Used to test if the loop works.
        // Property Search Page
        await page.goto('http://propaccess.traviscad.org/clientdb/PropertySearch.aspx?cid=1', {waitUntil: 'domcontentloaded'});
        // type the street name
        await page.select('select[name="propertySearchOptions:recordsPerPage"]', '250'); // Select 250 results per page
        await page.screenshot({path: 'screenshot.jpg', fullPage: true});
        await page.focus('#propertySearchOptions_streetName');
        await page.type('input[name="propertySearchOptions:streetName"]', street_names[i]);
        //await page.keyboard.type('street_names[i]');
        await page.click('#propertySearchOptions_searchAdv');
        // Results Page
        await page.screenshot({path: `${street_names[i]}_screenshot.jpg`, fullPage: true}); // template literal so each street gets its own file
        await page._client.send('Page.setDownloadBehavior', {behavior: 'allow', downloadPath: './results'});
        await page.waitForSelector('#footer');
        await page.click('#propertySearchResults_exportResults');
        await page.waitForTimeout(3500);
        await page.screenshot({path: 'screenshot.jpg', fullPage: true});
    }

    await browser.close();
    process.exit(1);
})();
I see that you defined the function inside the loop, but you never call it.
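For illustration (a minimal example, unrelated to the scraping code itself), the difference is only the trailing ():

// This only defines an async arrow function; nothing inside it ever runs:
(async () => {
    console.log('never printed');
});

// Adding () at the end actually invokes it:
(async () => {
    console.log('printed');
})();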
I would like to add an image before taking a screenshot with Puppeteer.
The following code works, but instead of waiting a fixed time like this, I would like to wait until the img is actually there:
element.innerHTML = "<img id=\"logo_website\" src=\"http://random.com/logo.jpg\">";
await page.waitFor(2000)
I tried with the following "waitFor" but it doesn't work.
await page.waitFor("#logo_website")
You can try page.waitForResponse() in this way:
'use strict';

const puppeteer = require('puppeteer');

(async function main() {
    try {
        const browser = await puppeteer.launch();
        const [page] = await browser.pages();
        await page.goto('https://example.org/');
        await Promise.all([
            page.waitForResponse('https://www.iana.org/_img/2013.1/iana-logo-header.svg'),
            page.evaluate(() => {
                document.body.innerHTML = '<img id="logo_website" src="https://www.iana.org/_img/2013.1/iana-logo-header.svg">';
            }),
        ]);
        await page.screenshot({ path: 'scr.png' });
        await browser.close();
    } catch (err) {
        console.error(err);
    }
})();
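If you would rather wait on the element itself than on the network response, another option (a sketch, assuming the #logo_website id from your snippet) is to wait until the image reports that it has finished loading:

// Wait for the injected <img> to exist and to have finished loading.
await page.waitForFunction(() => {
    const img = document.querySelector('#logo_website');
    return img && img.complete && img.naturalWidth > 0;
});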
Can someone help me understand why the product data doesn't get printed out? I'm currently using Puppeteer to scrape a website for product data.
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
//link to page that i want to scrape
await page.goto(
"link link",
{ waitUntil: "networkidle2" }
);
var data = await page
.evaluate(() => {
var productData = {};
productData["brand"] = document.querySelector(
"a.designer-name > span"
).textContent;
console.log("mimo");
return productData;
})
.catch(err => {
console.log(err);
});
console.log(data);
await browser.close();
})();
You are using a promise and a callback together. If you instead return a promise from page.evaluate, it should work.
Thanks to @tehhowch.
var data = await page
    .evaluate(async () => {
        return await new Promise(resolve => { // <-- return the data to node.js from the browser
            var productData = {};
            productData["brand"] = document.querySelector(
                "a.designer-name > span"
            ).textContent;
            console.log("mimo");
            resolve(productData);
        });
    })
    .catch(err => {
        console.log(err);
    });

console.log(data);
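Note also that the console.log("mimo") inside page.evaluate runs in the browser, so it appears in the page's DevTools console, not in your terminal. If you want to see those messages from Node, you can forward them (a small, optional addition):

// Forward messages from the page's console to the Node process.
page.on('console', msg => console.log('PAGE LOG:', msg.text()));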
Hi, I am trying to make a screenshot service.
const puppeteer = require('puppeteer');

var resWidth = 1366;
var resHeight = 1000;
var browser;

(async () => {
    browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
and when I receive a job I try to do:
data.forEach(function(d){
    try {
        console.log(d["url"]);
        (async () => {
            var page = await browser.newPage();
            await page.setViewport({width: resWidth, height: resHeight});
            await page.goto(d["url"], {timeout: 90000, waitUntil: 'networkidle'});
            await page.screenshot({path: './picdata/' + d['id'] + '.png' ,fullPage: true});
            await page.close();
        })();
    } catch(e) {}
});
but I can't. Here is the error:
(node:40596) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 7): TypeError: Cannot read property 'newPage' of undefined
I don't want to open a new browser for each screenshot, since launching a browser takes time and requires more memory.
What should I do?
The problem:
(async () => {
    browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
This code never gets executed. Why? Because you only define the async arrow function; it is never invoked (there is no trailing ()).
Even if it were invoked, that alone would not be enough for your scenario, because the forEach callbacks may run before the browser has finished launching, so browser can still be undefined at that point.
My attempt with your example:
'use strict';

const puppeteer = require('puppeteer');

const resWidth = 1366;
const resHeight = 1000;

let browser;
let page;

async function launchBrowser() {
    browser = await puppeteer.launch({ headless: true }); // headless: true ensures no browser window is ever opened
}

launchBrowser().then(async () => { // wait until the browser has launched
    for (const d of data) { // a for...of loop lets us await each screenshot before starting the next one
        try {
            page = await browser.newPage(); // the docs are not very clear about this method, so I am unsure whether it could be moved into launchBrowser() above
            await page.setViewport({ width: resWidth, height: resHeight });
            await page.goto(d['url'], { timeout: 90000, waitUntil: 'networkidle' });
            await page.screenshot({ path: './picdata/' + d['id'] + '.png', fullPage: true });
            await page.close(); // close the tab so they don't pile up
        }
        catch (e) {
            console.log(e); // log the error and continue with the next url
        }
    }
    await browser.close(); // close the browser once all screenshots are done
});
Pro Tip: Start using let, const instead of var.