Puppeteer works fine locally but not after deployment? - node.js

I have this browser started like this :
// Launch a headless Chromium tuned for constrained/containerized hosts
// (no sandbox, no /dev/shm reliance, single process) and return both the
// browser handle and a freshly opened page.
async function startBrowser() {
  const launchArgs = [
    '--no-sandbox',
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--disable-setuid-sandbox',
    '--no-first-run',
    '--no-zygote',
    '--single-process',
  ];
  const browser = await puppeteer.launch({ args: launchArgs });
  const page = await browser.newPage();
  return { browser, page };
}
and I have a function that is called that runs this code :
// Start a fresh browser; startBrowser() already returns a usable `page`.
const { browser, page } = await startBrowser();
// NOTE(review): a second tab is opened even though `page` is also used
// below — one page would likely suffice; confirm both are needed.
const tab = await browser.newPage()
// NOTE(review): the URL template literals are empty (presumably redacted
// for the post). The double `await` is redundant — `tab.goto()` already
// resolves to the HTTPResponse.
const request = await (await tab.goto(``));
await page.goto(``);
console.log('scraping...');
console.log(page.url());
// Screenshot saved under the frontend's public assets.
// NOTE(review): `coinId` is not defined in this snippet — assumed to be
// provided by the enclosing function; verify.
await page.screenshot({
path: `frontend/public/assets/${coinId}.png`,
});
// Parse the raw HTTP response body with cheerio and extract auction fields.
// NOTE(review): if the site renders these nodes with client-side JS, the
// raw response will not contain them — a likely cause of the "empty data
// after deployment" symptom; consider page.content() after load instead.
const text = await request.text();
const $ = cheerio.load(text);
const coinImage = $('#one img').attr('src');
const coinTitle = $('#three div h1').text();
const bids = $('#bidsrow td input').attr('value');
const timeLeft = $('#endsrow #endstext').text();
const currentBid = $('#currentbidtext').text();
const itemId = $('#itemidrow td .bolder').text();
const minBid = $('#minimumbidtext').text().replace('$', '');
On my local machine it scrapes the data perfectly fine; however, when I deploy this app to Heroku or even AWS EC2, it doesn't seem to scrape anything and just returns empty data.
Is this because the browser is not started correctly? or ?
I spent all day deploying this app to both Heroku and AWS, and I'm still very confused about what is wrong.

Related

How to catch a tab drop in puppeteer-extra and refresh the page?

I have a small application on puppeteer-extra, it works through a proxy server, sometimes the proxy server crashes and I get this error on the page.
if you click the "reload" button, the page will refresh and everything will be fine.
But how can I do it programmatically?
How do I catch such a tab drop?
require('dotenv').config();
const puppeteer = require('puppeteer-extra')
const PuppeteerExtraPluginProxy = require('puppeteer-extra-plugin-proxy2')
const pluginStealth = require('puppeteer-extra-plugin-stealth')
const sleep = require('./src/ToolsSleep');
// Launches a proxied, stealth-enabled browser, opens the target link with a
// spoofed referer, then loops forever over the open tabs: the desired page
// is processed, every other tab (e.g. ad popups) is closed.
async function main() {
// NOTE(review): the '#' in the proxy URL looks like a redacted '@'
// separating credentials from host — confirm the real format.
puppeteer.use(PuppeteerExtraPluginProxy({
proxy: 'socks://username:password#gateproxy.com:6969',
}))
puppeteer.use(pluginStealth());
// NOTE(review): fetchLinkPage() is not defined in this file — assumed
// imported/declared elsewhere; it must yield { link, referer }.
let file_link = await fetchLinkPage();
let browser = await puppeteer.launch({
headless: false,
userDataDir: './var/prof',
args: [
'--window-size=1200,1400',
'--window-position=000,000',
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-web-security',
'--disable-features=IsolateOrigins',
'--disable-site-isolation-trials'
]
});
let page = await browser.newPage();
await page.setExtraHTTPHeaders({ referer: file_link.referer })
await page.goto(file_link.link);
// NOTE(review): `pages` is captured once, BEFORE the loop — tabs opened or
// closed later are never seen; re-query browser.pages() inside the loop if
// tab churn matters (relevant to catching dropped/reloaded tabs).
let pages = await browser.pages();
while (true) {
for await (let tab of pages) {
await sleep(1500);
if (await isDesiredPage(tab)) {
await DesiredPage(tab);
}else{
// we will close the ad if it is in other tabs
await tab.close();
}
}
await sleep(500);
}
}
main().catch((e) => {
throw e
})
I want to make sure that my "reload" button is pressed automatically when the tab drops. How do I do this?

puppeteer Error: No node found for selector: .olp-text-box

I'm trying to get the number of sellers (which are selling only NEW items) on the Amazon product page using puppeteer.
For some reason, I'm getting an error on the first click, '.olp-text-box'.
Any ideas?
here is my code
// Build the Amazon product URL from the ASIN posted by the client.
// NOTE(review): `req` is not in scope at module level — presumably this
// code lives inside a route handler; verify.
const pupUrl = 'https://www.amazon.com/dp/' + req.body.asinIdInput;
// Opens the product page and clicks through to the "new offers" panel.
// NOTE(review): each click fires immediately after load with no
// waitForSelector — '.olp-text-box' may not exist yet (or at all on some
// page layouts), which matches the "No node found for selector" error.
async function configureBrowser(){
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(pupUrl , {waitUntil: 'load', timeout: 0});
await page.click('.olp-text-box');
await page.click('#aod-filter-string');
await page.click('.a-size-base.a-color-base')
return page;
}
// Reloads the page and logs the seller-count text.
// NOTE(review): reload() discards the panel the clicks above opened, and
// `$` (cheerio/jQuery) is not defined in this file — confirm the intended
// parsing approach (e.g. cheerio.load(html) first).
async function checkSellers(page){
await page.reload();
let html = await page.evaluate(()=> document.body.innerHTML);
$('#aod-filter-offer-count-string',html).each(function(){
var numberOfSellers = $(this).text();
console.log(numberOfSellers);
});
}
// Entry point: configure the page, then check the seller count once.
async function monitor(){
let page = await configureBrowser();
await checkSellers(page);
}
monitor();

How to export this node.js function in react.js

I am working on a dashboard where the user should be able to click a button and then get some data that was scraped from a site. I used Puppeteer and it prints the desired data to the console, but how can I add this to my React application?
Here's the puppeteer code i wrote :
const puppeteer = require('puppeteer');
// Scrape the image src, title text and tag text from the given page.
// Fixes over the original: the browser leaked whenever an XPath lookup
// found nothing (`el` undefined → getProperty throws before close), and
// browser.close() was not awaited. The scraped fields are now also
// returned, so the function can be exported and consumed by a caller
// (e.g. an API route feeding the React dashboard).
async function scrapeProduct(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);
    const [el] = await page.$x('/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div/div[1]/a/img');
    const src = await el.getProperty('src');
    const srcTxt = await src.jsonValue();
    const [el2] = await page.$x('/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div/div[2]/div/a/h3');
    const txt = await el2.getProperty('textContent');
    const rawTxt = await txt.jsonValue();
    const [el3] = await page.$x('/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div/div[2]/div/div');
    const txt2 = await el3.getProperty('textContent');
    const tags = await txt2.jsonValue();
    const result = {srcTxt, rawTxt, tags};
    console.log(result);
    return result;
  } finally {
    // Always release the browser, even when a lookup fails mid-scrape.
    await browser.close();
  }
}
// Surface failures instead of leaving the promise floating.
scrapeProduct('https://2degrees-investing.org/resources/').catch(console.error);
Is there a way to export this function and use it as an import somewhere ? Thanks in advance.

Running Puppeteer from GitLab

I am trying to execute the below script from GitLab CI/CD.
This puppeteer script is in a .js which is getting called from the GitLab repositories .gitlab-ci.yml file.
The purpose of the script is to navigate to INITIAL_PAGE_URL, login and navigate to the HOME_PAGE. The sign-in-button has a click method which on successful login navigates to the HOME_PAGE.
The script runs perfectly when running from the local system but when running from GitLab:
no error is shown
console.log("logged in") is executed and prints the message.
However, it does not navigate to the next page and page.url() still shows the INITIAL_PAGE_URL.
Any suggestions?
// Target page after a successful login.
const HOME_PAGE = "https://www.abcd.com/home"
// CSS selectors for the login form and its fields.
const SIGN_IN_FORM = "#frmSignIn";
const USERNAME_SELECTOR = 'input#EmailAddress';
const PASSWORD_SELECTOR = 'input#Password';
const LOGIN_BUTTON_SELECTOR = '#sign-in-button';
// NOTE(review): placeholder credentials — in CI these should come from
// protected environment variables, not literals.
const SECRET_EMAIL = 'username';
const SECRET_PASSWORD = 'password';
// Prefer CHROME_EXE_PATH from the environment, falling back to the distro
// chromium. NOTE(review): when the env var is the empty string this
// resolves to "" on purpose — confirm what executablePath: "" is meant to
// select in your setup.
const CHROME_EXE_PATH =
process.env.CHROME_EXE_PATH === "" ? "" : process.env.CHROME_EXE_PATH || "/usr/bin/chromium-browser";
const puppeteer = require('puppeteer')
// Log in from the initial page and land on HOME_PAGE.
// Fix for the CI-only symptom ("logged in" prints but page.url() is still
// the initial URL): page.click() resolves as soon as the click is
// dispatched, NOT when the resulting navigation finishes. On a fast local
// machine the navigation happens to win the race; on the slower GitLab
// runner it doesn't. Pairing the click with page.waitForNavigation() in a
// Promise.all blocks until the post-login page has actually loaded.
const main = async () => {
  const browser = await puppeteer.launch({
    headless: true,
    executablePath: CHROME_EXE_PATH,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  })
  console.log("browser loaded")
  const page = await browser.newPage()
  await page.setViewport({width: 1366, height: 768})
  //Script for login page - start
  console.log("Navigating to initial page")
  // NOTE(review): INITIAL_PAGE_URL is not declared in this file — assumed
  // defined alongside the other constants; confirm.
  await page.goto(INITIAL_PAGE_URL, { waitUntil: 'networkidle2' })
  // Use the declared selector constants consistently (the original mixed
  // string literals and constants for the same selectors).
  await page.waitForSelector(SIGN_IN_FORM)
  await page.type(USERNAME_SELECTOR, SECRET_EMAIL)
  await page.type(PASSWORD_SELECTOR, SECRET_PASSWORD)
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'networkidle2' }),
    page.click(LOGIN_BUTTON_SELECTOR),
  ])
  console.log("logged in")
  console.log(page.url());
  await browser.close();
}
// Report async failures (and fail the CI job) instead of leaving the
// top-level promise floating.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
})

Reuse a browser instance in Puppeteer

I would like to know if it is possible to have one .js file that opens a browser instance, creates new page/tab logs in to a website (with username/password) and just stays idle. And in a second .js file use file one browser instance and its page.
1.js
const puppeteer = require('puppeteer');
(async () => {
  // Start a headless browser that keeps running in the background.
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox'],
    ignoreDefaultArgs: ["--hide-scrollbars"]
  });
  // Open one tab and load Google so the session holds a live page.
  const page = await browser.newPage();
  await page.goto('https://google.com');
  console.log('Browser open in the background (headless)!');
  // Intentionally no browser.close() — the instance stays alive for reuse.
  //await browser.close();
})();
2.js
const puppeteer = require('puppeteer');
(async () => {
// Placeholder: reuse the browser instance/page created by 1.js here.
// instructions on browser instance/page from 1.js ...
})();
The crawler object keeps the state of the browser instance and
wherever you call/pass that instance, it refers to the same chromium
in the "background". If this is an overkill, and you just want to
connect to an already running chromium using puppeteer, you can do it
with puppeteer.connect. take a look at this:
How to "hook in" puppeteer into a running Chrome instance/tab – mbit
Yeah, I guess it's overkill for me :). The link you posted was what I wanted, but I have 2 questions.
This Is a sample what I have.
// 1.js
// Launch Chromium, open a tab on google.com, print the DevTools WebSocket
// endpoint (needed by 2.js to reattach), then detach without closing.
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto('https://google.com');
  const browserWSEndpoint = browser.wsEndpoint();
  // e.g. ws://127.0.0.1:51945/devtools/browser/6462daeb-...
  console.log(browserWSEndpoint);
  // disconnect() detaches puppeteer but leaves the browser running.
  browser.disconnect();
})();
And
// 2.js
// Reattach to the browser started by 1.js using its (manually pasted)
// DevTools endpoint, then detach again.
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.connect({browserWSEndpoint: 'ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e'});
  // somehow use the tab that is open from 1.js (google.com)
  await browser.disconnect();
})();
I get the browserWSEndpoint string from the console.log 1.js.
It works great but I have two difficulties.
1 - How can I use the variable browserWSEndpoint from 1.js so I dont have to always copy paste it to 2.js.
2- If I open a new page/tab on 1.js and go for example to google and disconnect (browser.disconnect()), how can use that page/tab on 2.js.
Working tested code
getEmail.js is where the actual page is exported. Ask for clarification in the comments.
getBrowser.js
const puppeteer = require("puppeteer");
module.exports = {
browser: {},
pptr_instance_url:"",
getBrow: async function(){ try {
console.log("line6",this.pptr_instance_url);
this.browser = await puppeteer.connect({browserWSEndpoint: this.pptr_instance_url}).catch(async e =>{
console.log("end point",this.pptr_instance_url);
this.browser = await puppeteer.launch({timeout: 0});
this.pptr_instance_url = this.browser.wsEndpoint();
console.log("line 11",this.pptr_instance_url);
return this.browser;
});
return this.browser;
}catch (e){
console.log(e)
} }
}
pageRenderer.js
const abc = require("../getBrowsernew")
// Renders the page for a request: stores the request body in the page's
// localStorage under a fresh UUID key, then navigates a new tab there and
// waits for the network to go idle before handing the page back.
// NOTE(review): `constants`, `uuidv4` and `setLocalStorage` are not
// defined/imported in this file — presumably provided elsewhere; verify.
const pageRenderer = async (request) => {
  const {reactProjectUrl} = constants;
  const uuidStorageKey = uuidv4();
  const localStorageObject = {[uuidStorageKey]: request.body};
  const browser = await abc.getBrow();
  let url = "someurl.com"
  await setLocalStorage(browser, url, localStorageObject);
  const page = await browser.newPage();
  // Fix: page.goto() takes (url, options) — the original passed a THIRD
  // argument ({waitUntil: 'load', timeout: 0}) that puppeteer silently
  // ignores. The intended "no navigation timeout" is merged into the one
  // options object. The unused `response` binding is dropped.
  await page.goto(url, {
    waitUntil: "networkidle0",
    timeout: 0,
  });
  return page;
}
module.exports = pageRenderer;
getEmail.js
const pageRenderer = require("./pageRenderer");
// Renders the page for `request` and returns its full HTML.
// Fix: the original called page.close() without awaiting it, leaving a
// floating promise whose rejection (if any) would go unhandled.
const getEmail = async (request) => {
  const page = await pageRenderer(request)
  const emailbody = await page.content();
  await page.close();
  return emailbody;
}
module.exports = getEmail;
You can implement this in many ways like having separate modules with functions, or different classes, and it depends on your particular need.
You can have a class that launches the browser and creates pages plus some extra functionalities.
//1.js
const puppeteer = require('puppeteer');
/**
 * Wraps one puppeteer browser plus one page behind a single object.
 * Construct it via the static async factory `Crawler.build()` —
 * constructors cannot be async, so browser startup lives in `_init()`.
 */
class Crawler {
  constructor() {
    //init with whatever values you'll need in your class
    //or throw an error if the object wasn't created through build
  }
  /** Async factory: creates the instance and starts the browser. */
  static async build() {
    const instance = new Crawler();
    await instance._init();
    return instance;
  }
  /** Launches the browser and opens the page, keeping both on `this`. */
  async _init() {
    this._browser = await puppeteer.launch({timeout: 0});
    this._page = await this._browser.newPage();
  }
  /** The underlying puppeteer Browser. */
  get browser() {
    return this._browser;
  }
  /** The page opened during _init(). */
  get page() {
    return this._page;
  }
  /** Navigates the held page to `url`; add real login steps here. */
  async login(url) {
    await this._page.goto(url);
    //do whatever is related to the login process
  }
}
module.exports = {Crawler};
Note that we can't have async functions in the constructor. Since launching browser is async, we use something like a build function to initiate the browser when creating the object. Then we create the crawler object like this:
//2.js
const {Crawler} = require('./1.js');
(async() => {
  // Build via the async factory, then drive the crawler's single page.
  const bot = await Crawler.build();
  await bot.login("https://example.com");
  // The page is reachable through the getter.
  console.log(bot.page.url());
})();
Keep in mind that this is only an example and by no means representative of the best practices. So first, you need to understand what you want to achieve out of such encapsulation, then adopt the method that suits you best.
Read more on JS classes here

Resources