Simple code, should work, but it doesn't.
const puppeteer = require ('puppeteer');
async function scrapeProduct(url) {
const browser = await puppeteer.launch({ headless:false });
const page = await browser.newPage();
await page.setExtraHTTPHeaders({
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
});
await page.goto(url)
const [el] = await page.$x('/html/body/main/div[1]/div/div/div[2]/h1');
const txt = await el.getProperty('txt')
const srcText = await txt.jsonValue()
console.log(srcText)
}
scrapeProduct('https://getbootstrap.com/')
//Same result on other urls as well.
I've also tried to querySelector instead of xPath, that worked in some cases, it would log the first value of the node as expected, but then querySelectorAll on the same element would again return "undefined". I've looked everywhere, but simply can't find the solution.
i do it this way
const puppeteer = require("puppeteer");
async function scrapeProduct(url) {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.setExtraHTTPHeaders({
"user-agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
});
await page.goto(url);
// wait for elements defined by XPath appear in page
await page.waitForXPath("/html/body/main/div[1]/div/div/div[2]/h1");
// evaluate XPath expression of the target selector (it return array of ElementHandle)
const headings = await page.$x("/html/body/main/div[1]/div/div/div[2]/h1");
// prepare to get the textContent of the selector above (use page.evaluate)
let textContent = await page.evaluate((el) => el.textContent, headings[0]);
console.log(textContent);
}
scrapeProduct('https://getbootstrap.com/')
upvote my answer if it helps !
Related
I'm using puppeteer for the first time running it on locally hosted firebase cloud functions.
I've been trying with different accounts, and I waited hours so that the error may resolves, but no luck. The error I'm getting:
I can't interact with the site, and even if I switch routs this is the only thing popping up.
What I did/tried:
I followed this tutorial and coded the exact same app: https://www.youtube.com/watch?v=dXjKh66BR2U
Searched for hours on google if there is anything like my problem, still no solution that worked for me.
Edit:
The code I'm using is basically from fireship.io:
const puppeteer = require('puppeteer');
const scrapeImages = async (username) => {
const browser = await puppeteer.launch( { headless: true });
const page = await browser.newPage();
await page.goto('https://www.instagram.com/accounts/login/');
// Login form
await page.screenshot({path: '1.png'});
await page.type('[name=username]', 'fireship_dev');
await page.type('[name=password]', 'some-pa$$word');
await page.screenshot({path: '2.png'});
await page.click('[type=submit]');
// Social Page
await page.waitFor(5000);
await page.goto(`https://www.instagram.com/${username}`);
await page.waitForSelector('img ', {
visible: true,
});
await page.screenshot({path: '3.png'});
// Execute code in the DOM
const data = await page.evaluate( () => {
const images = document.querySelectorAll('img');
const urls = Array.from(images).map(v => v.src);
return urls;
});
await browser.close();
console.log(data);
return data;
}
The error I'm getting on console:
UnhandledPromiseRejectionWarning: TimeoutError: waiting for selector `input[name="username"]` failed: timeout 30000ms exceeded
Try to add additional headers, before your page.goto(), like this:
await page.setExtraHTTPHeaders({
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
'upgrade-insecure-requests': '1',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,en;q=0.8'
})
It adds headers to make puppeteer look like a normal browser used from a normal OS
Im running a script that logs into an authenticated session on a website and clicks a button to download an excel file. Im able to run it with no problems while headless: false, but when headless:true, the file does not download.
My research suggests that the browser is closing before the download completes possibly? Ive added a wait of about 15 seconds, which is much longer than it should need to download the file, but still not getting anything. Another solution I tried was manually removing the HeadlessChrome substring from the userAgent in case the site was blocking it, but that didnt work either. Is it okay to use headless:false in a script that is used in a production web application deployed on Heroku?
async function getData () {
try {
const wait = (ms) => new Promise(resolve => setTimeout(resolve, ms))
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('<url>');
//login
await page.type('#username',username);
await page.click('#signIn');
await wait(4000)
await page.type('#password',password);
await page.click('#signIn');
await page.waitForNavigation();
await page.keyboard.press('Enter'); //click out of any pop up
// //go to merchandising page
await page.click('#m_69-link');
await page.waitForSelector('#ExcelReportButton', {visible: true})
//click on export as excel icon
await wait(4000)
await page.click('#ExcelReportButton');
await wait(15000)
await browser.close();
} catch (error) {
console.log(error)
}
};
try by adding additional headers, it worked for me:
await page.setExtraHTTPHeaders({
'Accept-Language': 'en-US,en;q=0.9'
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36');
headers["user-agent"] = fakeUa();
console.log(fakeUa())
let firstReq = true;
page.route('**/*', route => {
const request = route.request()
//console.log(request.url(), JSON.stringify(request.headers()));
if("x-j3popqvx-a" in request.headers()){
headers = request.headers();
//console.log(headers);
console.log("exiting");
return;
}
else {
console.log("in");
return route.continue({headers: headers});
}
});
let pageRes = await page.goto(url, {waitUntil: 'load', timeout: 0});
I want to add fake user agent when sending request to url. But it doesn't add the fake useragent rather goes with the default one.
While in puppeteer it was possible with the page.setUserAgent() method to apply a custom UA and page.setExtraHTTPHeaders() to set any custom headers, in playwright you can set custom user agent (userAgent) and headers (extraHTTPHeaders) as options of browser.newPage() or browser.newContext() like:
const page = await browser.newPage({ userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36' })
const page = await browser.newPage({
extraHTTPHeaders: {
'Cache-Control': 'no-cache'
}
})
Edit: In case you are using it with newContext() usage looks like this (make sure to set userAgent in the settings of newContext and not in newPage!):
const context = await browser.newContext({ userAgent: 'hello' })
const page = await context.newPage()
// to check the UA:
console.log(await page.evaluate(() => navigator.userAgent))
If you're using #playwright/test, you can set a user agent as follows:
import {expect, test} from "#playwright/test"; // ^1.30.0
const userAgent =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
test.describe("with user agent", () => {
test.use({userAgent});
test("does stuff", async ({page}) => {
await page.goto("https://example.com/");
await expect(page.locator("h1")).toHaveText("Example Domain");
});
});
i am trying to scrape today's matches on betfair and wanna get:
home team
away team
x odd
draw odd
y odd
problem is i keep getting multiple spaces, i have tried alot and cannot fix it, the problem is not with trim but with the execution flow that causes empty lines
Can somebody tell me what im doing wrong?
My code:
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');
var url = 'https://www.betfair.com/sport/football';
var customHeaderRequest = request.defaults({
headers: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
})
customHeaderRequest.get(url, function(err, resp, body){
$ = cheerio.load(body);
links = $('.section-list .section:nth-child(2) .event-list li');
$(links).each(function(i, link){
var home = $(link).find('.event-information div:nth-child(3) div a div span.team-name:nth-child(1)');
var h = home.text();
if(h != null || h!=''){
fs.appendFile('message.txt', h+'\n', function (err) {});
}
});
});
You shouldn't be calling fs.appendFile() in a loop like this and you may need a better test for an empty line than just what you were using. fs.appendFile() is an asynchronous operation and you're essentially calling a whole bunch of fs.appendFile() operations one after another without waiting for the prior ones to finish.
You can either use a stream or you have to wait until the previous fs.appendFile() is done before calling the next one.
And, if you want to make sure you have no blank-looking results, you need a better filter for results that have only whitespace in them (I added .trim() to my code below).
Here's one way to do that:
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');
const util = require('util');
const appendFile = util.promisify(fs.appendFile);
var url = 'https://www.betfair.com/sport/football';
var customHeaderRequest = request.defaults({
headers: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
})
customHeaderRequest.get(url, async function(err, resp, body){
try {
let $ = cheerio.load(body);
let links = $('.section-list .section:nth-child(2) .event-list li').toArray();
for (let link of links) {
const home = $(link).find('.event-information div:nth-child(3) div a div span.team-name:nth-child(1)').text().trim();
if (home) {
await appendFile('message.txt', home +'\n');
}
}
} catch(e) {
// error writing to the file, handle that error here
}
});
Other notes: You should also always declare all local variables you are using so they are never allowed to be implicit globals.
I'm using node.js puppeteer library to handle WhatsApp Web. I've managed to handle the entire page, except for when I try to upload a file via upload dialog.
I've tried many ways to handle the window dialog, including VBS, batch,sendkeys etc.
Is there any way that i could enter a text inside the text-line of the dialog box and press "Open" https://i.stack.imgur.com/cRVNJ.jpg][1] as well?
Here is my code up to the adding file in WhatsApp (raw code)
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3264.0 Safari/537.36');
page.goto('https://web.whatsapp.com/', { waitUntil: 'networkidle2', timeout: 0 }).then(async function (response) {
await page.waitFor(networkIdleTimeout);
await page.waitFor(user_chat_selector);
await page.click(user_chat_selector);
await page.waitFor(networkIdleTimeout);
await page.keyboard.type('Testing');
await page.waitFor(networkIdleTimeout);
await page.keyboard.press('Enter');
await page.waitFor(networkIdleTimeout);
await page.waitFor(pin_attach);
await page.click(pin_attach);
await page.waitFor(networkIdleTimeout);
await page.waitFor(add_image_icon);
await page.click(add_image_icon);
//await page.waitFor(networkIdleTimeout);
// await page.keyboard.type("a");
})
You don't need to open a real dialog, there is a method for uploading files:
const elementHandle = await page.$('input');
await elementHandle.uploadFile("/path/to/file");