I am working on a dashboard where the user should be able to click a button and then get some data that has been scraped from a site. I used Puppeteer and it prints the desired data to the console, but how can I add this to my React.js application?
Here's the Puppeteer code I wrote:
const puppeteer = require('puppeteer');

async function scrapeProduct(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url);

  // Image source
  const [el] = await page.$x('/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div/div[1]/a/img');
  const src = await el.getProperty('src');
  const srcTxt = await src.jsonValue();

  // Title text
  const [el2] = await page.$x('/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div/div[2]/div/a/h3');
  const txt = await el2.getProperty('textContent');
  const rawTxt = await txt.jsonValue();

  // Tags text
  const [el3] = await page.$x('/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div/div[2]/div/div');
  const txt2 = await el3.getProperty('textContent');
  const tags = await txt2.jsonValue();

  console.log({ srcTxt, rawTxt, tags });
  await browser.close();
}
scrapeProduct('https://2degrees-investing.org/resources/');
Is there a way to export this function and use it as an import somewhere? Thanks in advance.
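For what it's worth, here is a rough sketch of one common setup. Puppeteer can't run inside the React bundle itself, so you would export scrapeProduct from its own module and expose it behind a small API route that the React button calls with fetch. The Express server, the /api/scrape route, and the port below are my assumptions, and scrapeProduct would need to return the object instead of only logging it:

// scraper.js: export the function instead of calling it at the bottom of the file
module.exports = { scrapeProduct };

// server.js: a minimal Express wrapper (hypothetical setup, adjust to your project)
const express = require('express');
const { scrapeProduct } = require('./scraper');

const app = express();

app.get('/api/scrape', async (req, res) => {
  try {
    // assumes scrapeProduct is changed to `return { srcTxt, rawTxt, tags };`
    const data = await scrapeProduct('https://2degrees-investing.org/resources/');
    res.json(data);
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});

app.listen(4000);

On the React side, the button's click handler would then just be something like fetch('/api/scrape').then(res => res.json()).then(setData), assuming a dev proxy or a matching port.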
TimeoutError: Waiting for selector .//*[@id="main-content"]/section[1]/div/div/section[1]/div/div[2]/section/div/ul[2]/li[1] failed: Waiting failed: 30000ms exceeded
This is the error I keep getting. I am copying the XPath exactly as it is. I am currently trying to scrape job postings on LinkedIn for fun, and this is the "job requirements" section. I am only trying to get the first list item, and it keeps saying there's an error.
Here is my code:
const puppeteer = require('puppeteer');
const xlsx = require('xlsx');

async function webScraper(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  const JobLink = url;
  await page.goto(url);

  await page.waitForXPath('//*[@id="main-content"]/section[1]/div/section[2]/div/div[1]/div/h1')
  await page.waitForXPath('//*[@id="main-content"]/section[1]/div/div/section[1]/div/ul/li[3]/span')
  await page.waitForXPath('//*[@id="main-content"]/section[1]/div/section[2]/div/div[1]/div/h4/div[1]/span[1]/a')
  await page.waitForXPath('//*[@id="main-content"]/section[1]/div/div/section[1]/div/ul/li[4]/span')
  await page.waitForXPath('//*[@id="main-content"]/section[1]/div/div/section[1]/div/ul/li[2]/span')
  await page.waitForXPath('//*[@id="main-content"]/section[1]/div/section[2]/div/div[1]/div/h4/div[1]/span[2]')
  // This is the wait that times out
  await page.waitForXPath('//*[@id="main-content"]/section[1]/div/div/section[1]/div/div[2]/section/div/ul[2]/li[1]')

  const [posName] = await page.$x('//*[@id="main-content"]/section[1]/div/section[2]/div/div[1]/div/h1')
  const posSrc = await posName.getProperty('textContent')
  const positionName = await posSrc.jsonValue();

  const [type] = await page.$x('//*[@id="main-content"]/section[1]/div/div/section[1]/div/ul/li[3]/span')
  const typeSrc = await type.getProperty('textContent')
  const JobType = await typeSrc.jsonValue();

  const [company] = await page.$x('//*[@id="main-content"]/section[1]/div/section[2]/div/div[1]/div/h4/div[1]/span[1]/a')
  const companySrc = await company.getProperty('textContent')
  const companyName = await companySrc.jsonValue();

  const [industry] = await page.$x('//*[@id="main-content"]/section[1]/div/div/section[1]/div/ul/li[4]/span')
  const industrySrc = await industry.getProperty('textContent')
  const industryType = await industrySrc.jsonValue();

  const [comit] = await page.$x('//*[@id="main-content"]/section[1]/div/div/section[1]/div/ul/li[2]/span')
  const comitSrc = await comit.getProperty('textContent')
  const committment = await comitSrc.jsonValue();

  const [location] = await page.$x('//*[@id="main-content"]/section[1]/div/section[2]/div/div[1]/div/h4/div[1]/span[2]')
  const locationSrc = await location.getProperty('textContent')
  const joblocation = await locationSrc.jsonValue();

  // These are the "job requirements" lines that fail
  const [requirement] = await page.$x('//*[@id="main-content"]/section[1]/div/div/section[1]/div/div[2]/section/div/ul[2]/li[1]')
  const requireSrc = await requirement.getProperty('textContent')
  const jobRequirements = await requireSrc.jsonValue();

  return {positionName, JobType, JobLink, companyName, industryType, committment, joblocation, jobRequirements}
}
I tried going to another job posting, but the XPath is exactly the same. Also, I was wondering if there is any way to make it so that if the browser isn't able to wait for an XPath, it will just skip over it and continue on to the next piece of data, since I have to restart my script every time something like this happens.
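For the second part, one option (a sketch, not tested against LinkedIn, and the helper name is mine) is to wrap the wait-and-read steps in a small function that catches the timeout and returns null, so a missing element is skipped instead of killing the whole run:

// Returns the element's textContent, or null if the XPath never appears in time
async function textOrNull(page, xpath, timeout = 10000) {
  try {
    await page.waitForXPath(xpath, { timeout });
    const [el] = await page.$x(xpath);
    const prop = await el.getProperty('textContent');
    return await prop.jsonValue();
  } catch (err) {
    console.warn(`Skipping missing element: ${xpath}`);
    return null;
  }
}

// Example usage inside webScraper, for the wait that currently times out:
// const jobRequirements = await textOrNull(page,
//   '//*[@id="main-content"]/section[1]/div/div/section[1]/div/div[2]/section/div/ul[2]/li[1]');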
I'm trying to get the number of sellers (which are selling only NEW items) on an Amazon product page using Puppeteer.
For some reason, I'm getting an error on the first click, '.olp-text-box'.
Any ideas?
Here is my code:
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');

const pupUrl = 'https://www.amazon.com/dp/' + req.body.asinIdInput;

async function configureBrowser(){
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(pupUrl, {waitUntil: 'load', timeout: 0});
  await page.click('.olp-text-box');   // <-- this click throws
  await page.click('#aod-filter-string');
  await page.click('.a-size-base.a-color-base');
  return page;
}

async function checkSellers(page){
  await page.reload();
  let html = await page.evaluate(() => document.body.innerHTML);
  // Parse the HTML with cheerio instead of relying on a global jQuery $
  const $ = cheerio.load(html);
  $('#aod-filter-offer-count-string').each(function(){
    var numberOfSellers = $(this).text();
    console.log(numberOfSellers);
  });
}

async function monitor(){
  let page = await configureBrowser();
  await checkSellers(page);
}

monitor();
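A guess at the cause, plus a sketch: page.goto resolving does not guarantee that '.olp-text-box' exists yet, so page.click can throw if the element has not rendered (or if Amazon served a different layout). Waiting for each selector before clicking usually helps; the selectors below are simply the ones from the code above and may differ per page:

async function configureBrowser() {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(pupUrl, { waitUntil: 'load', timeout: 0 });

  // Wait until each element actually exists before clicking it
  await page.waitForSelector('.olp-text-box');
  await page.click('.olp-text-box');
  await page.waitForSelector('#aod-filter-string');
  await page.click('#aod-filter-string');

  // Read the offer count directly, without reloading the page or using jQuery
  await page.waitForSelector('#aod-filter-offer-count-string');
  const numberOfSellers = await page.$eval(
    '#aod-filter-offer-count-string',
    el => el.textContent.trim()
  );
  console.log(numberOfSellers);

  return page;
}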
I've been trying to make this simple code work with pkg.
const puppeteer = require("puppeteer");
async function scraper(url) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
const title = await page.title();
await browser.close();
return title;
}
scraper("http://example.com").then(console.log);
But the exe closes immediately.
I know the problem is that there is no "specified" path for Chromium. I searched a lot and tried many different things, but nothing worked.
Something I tried:
const browser = await puppeteer.launch({executablePath: '/path/to/Chrome'});
But that never worked either.
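In case it helps, the workaround that usually gets this running is to point Puppeteer at a Chrome/Chromium binary that exists on the target machine, since pkg cannot bundle the Chromium that Puppeteer downloads into its snapshot. The environment variable name and the Windows path below are assumptions; adjust them to wherever Chrome actually lives:

const puppeteer = require('puppeteer');

// Resolve a Chrome binary outside the pkg snapshot (both values here are examples)
const chromePath =
  process.env.CHROME_PATH ||
  'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe';

async function scraper(url) {
  const browser = await puppeteer.launch({ executablePath: chromePath });
  const page = await browser.newPage();
  await page.goto(url);
  const title = await page.title();
  await browser.close();
  return title;
}

// Log errors instead of letting the exe close silently
scraper('http://example.com')
  .then(console.log)
  .catch(err => console.error(err));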
I'm using Puppeteer to retrieve data online and I'm facing an issue.
The page assigns the same serialized object twice: the first assignment is an empty object, but the second one contains the data I'm targeting.
My question is: how can I select the second occurrence instead of the first one, which returns an empty object?
Thanks.
My code:
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const Variants = require('./variants.js');
const Feedback = require('./feedback.js');

async function Scraper(productId, feedbackLimit) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  /** Scrape page for details */
  await page.goto(`${productId}`);

  // page.evaluate() needs a function; grab the page HTML and match on it.
  // This only finds the first occurrence, which is the empty object.
  const matched = (await page.evaluate(() => document.documentElement.innerHTML)).match(/window.runParams = {"result/)
  const data = matched && matched.items

  await page.close();
  await browser.close();
  console.log(data);
  return data;
}

module.exports = Scraper;
Website source code:
window.runParams = {};
window.runParams = {"resultCount":19449,"seoFeaturedSnippet":};
Please try this, it should work.
const data = await page.content();
const regexp = /window.runParams/g;
const matches = data.matchAll(regexp);
for (const match of matches) {
  console.log(match);
  console.log(match.index);
}
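Building on that, a sketch of pulling the JSON out of the second occurrence specifically. This assumes the assignment sits on a single line and ends with a semicolon, which may not hold for every page:

const html = await page.content();
const matches = [...html.matchAll(/window\.runParams = /g)];

if (matches.length >= 2) {
  const second = matches[1];
  // Everything after the second "window.runParams = ", up to the end of that line
  const tail = html.slice(second.index + second[0].length);
  const jsonText = tail.slice(0, tail.indexOf('\n')).replace(/;\s*$/, '');
  const runParams = JSON.parse(jsonText); // may throw if the object is not valid JSON
  console.log(runParams.resultCount);
}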
I'm using headless Chrome and the chrome-remote-interface npm package:
const chromeLauncher = require('chrome-launcher')
const CDP = require('chrome-remote-interface')
const fs = require('fs')

async function launchChrome() {
  return chromeLauncher.launch({
    port: 9222,
    chromeFlags: ['--disable-gpu', '--headless']
  })
}

(async function () {
  const chrome = await launchChrome()
  const client = await CDP({port: 9222})
  const {Page} = client

  await Page.enable()
  await Page.navigate({url: 'http://...'})
  await Page.loadEventFired()

  // Capture the screenshot and write it to disk
  const {data} = await Page.captureScreenshot()
  fs.writeFileSync('screenshot.png', Buffer.from(data, 'base64'))

  await client.close()
  await chrome.kill()
})()
But the page I'm loading contains an iframe, and I want to perform some actions inside it (click some elements or access them via DOM.querySelector) before taking the screenshot.
Is it possible to switch the current 'active' frame somehow via the Chrome DevTools Protocol, like it is possible in Nightwatch.js/Selenium with browser.frame(frameIndex)?
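As far as I know there is no direct 'switch frame' command in the DevTools Protocol, but you can evaluate code in the iframe's own execution context. A rough sketch that would slot into the middle of the script above, before captureScreenshot (it assumes a same-origin iframe and a made-up '#some-button' selector):

const {Page, Runtime} = client

// Track execution contexts as they are created; each frame gets its own
const contextsByFrame = new Map()
Runtime.executionContextCreated(({context}) => {
  if (context.auxData && context.auxData.frameId) {
    contextsByFrame.set(context.auxData.frameId, context.id)
  }
})

await Promise.all([Page.enable(), Runtime.enable()])
await Page.navigate({url: 'http://...'})
await Page.loadEventFired()

// Find the iframe's frame id from the frame tree (here: the first child frame)
const {frameTree} = await Page.getFrameTree()
const childFrameId = frameTree.childFrames[0].frame.id

// Run DOM code inside that frame before taking the screenshot
await Runtime.evaluate({
  expression: `document.querySelector('#some-button').click()`,
  contextId: contextsByFrame.get(childFrameId)
})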