How to remember authorization when parsing a site on nodejs cookies - node.js

i go to the site and look for an authorization form, if I find it, then I log in, if not, then I do the actions I need
let cookie_ = fs.readFileSync("cookies.json");//I am looking for a file with saved cookies
cookie = JSON.parse(cookie_);//Converting to json
nightmare
.goto('https://site.ru/login')//I go to the site
.cookies.set(cookie)//I substitute cookies from the file
.evaluate(function () {
return document.querySelector('input[id="email"]');//I am looking for a field to enter mail
})
.then(function (page) {
if(page) {//I check if there is a field for entering mail
f().then(function (cookies) {//We get the result from the function
require('fs').writeFileSync(//And write to file
'cookies.json',
JSON.stringify(cookies)
);
})
} else {
console.log('You are logged in');
}
})
async function f() {//I call the function if we are not authorized
return new Promise((resolve, reject) => {
nightmare
.goto('https://site.ru/login')
.type('input[id="email"]', 'login')//Enter mail
.type('input[id="password"]', 'passord')//Enter your password
.click('.btn.btn-danger')//Click on the authorization button
.wait(2000)//We wait 2 seconds
.cookies.get()//We receive cookies
.then(resolve)
});
}
The file is created, cookies are written, but with the next attempts to run the script, the authorization form still appears
I also tried first to go to - goto('about: blank') then set cookies and then go to goto('https://site.ru/login')
Error - UnhandledPromiseRejectionWarning: Error: Setting cookie failed

Unfortunately, it was not possible to solve the problem through nightmare
Solved with puppeteer
Example - saving cookies to file
const puppeteer = require('puppeteer')
const fs = require('fs');
(async () => {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto('https://github.com/login')
await page.type('#login_field', process.env.GITHUB_USER)
await page.type('#password', process.env.GITHUB_PWD)
await page.waitForSelector('.js-cookie-consent-reject')
await page.click('.js-cookie-consent-reject')
await page.$eval('[name="commit"]', (elem) => elem.click())
await page.waitForNavigation()
const cookies = await page.cookies()
const cookieJson = JSON.stringify(cookies)
fs.writeFileSync('cookies.json', cookieJson)
await browser.close()
})()
Reading cookies from a file
const puppeteer = require('puppeteer')
const fs = require('fs');
(async () => {
const browser = await puppeteer.launch()
const page = await browser.newPage()
const cookies = fs.readFileSync('cookies.json', 'utf8')
const deserializedCookies = JSON.parse(cookies)
await page.setCookie(...deserializedCookies)
await page.goto(`https://github.com/${process.env.GITHUB_USER}`)
await browser.close()
})()
Article

Related

fs.writeFile crashes node app after writing first json file

I'm trying to crawl several web pages to check broken links and writing the results of the links to a json files, however, after the first file is completed the app crashes with no error popping up...
I'm using Puppeteer to crawl, Bluebird to run each link concurrently and fs to write the files.
WHAT IVE TRIED:
switching file type to '.txt' or '.php', this works but I need to create another loop outside the current workflow to convert the files from '.txt' to '.json'. Renaming the file right after writing to it also causes the app to crash.
using try catch statements for fs.writeFile but it never throws an error
the entire app outside of express, this worked at some point but i trying to use it within the framework
const express = require('express');
const router = express.Router();
const puppeteer = require('puppeteer');
const bluebird = require("bluebird");
const fs = require('fs');
router.get('/', function(req, res, next) {
(async () => {
// Our (multiple) URLs.
const urls = ['https://www.testing.com/allergy-test/', 'https://www.testing.com/genetic-testing/'];
const withBrowser = async (fn) => {
const browser = await puppeteer.launch();
try {
return await fn(browser);
} finally {
await browser.close();
}
}
const withPage = (browser) => async (fn) => {
const page = await browser.newPage();
// Turns request interceptor on.
await page.setRequestInterception(true);
// Ignore all the asset requests, just get the document.
page.on('request', request => {
if (request.resourceType() === 'document' ) {
request.continue();
} else {
request.abort();
}
});
try {
return await fn(page);
} finally {
await page.close();
}
}
const results = await withBrowser(async (browser) => {
return bluebird.map(urls, async (url) => {
return withPage(browser)(async (page) => {
await page.goto(url, {
waitUntil: 'domcontentloaded',
timeout: 0 // Removes timeout.
});
// Search for urls we want to "crawl".
const hrefs = await page.$$eval('a[href^="https://www.testing.com/"]', as => as.map(a => a.href));
// Predefine our arrays.
let links = [];
let redirect = [];
// Loops through each /goto/ url on page
for (const href of Object.entries(hrefs)) {
response = await page.goto(href[1], {
waitUntil: 'domcontentloaded',
timeout: 0 // Remove timeout.
});
const chain = response.request().redirectChain();
const link = {
'source_url': href[1],
'status': response.status(),
'final_url': response.url(),
'redirect_count': chain.length,
};
// Loops through the redirect chain for each href.
for ( const ch of chain) {
redirect = {
status: ch.response().status(),
url: ch.url(),
};
}
// Push all info of target link into links
links.push(link);
}
// JSONify the data.
const linksJson = JSON.stringify(links);
fileName = url.replace('https://www.testing.com/', '');
fileName = fileName.replace(/[^a-zA-Z0-9\-]/g, '');
// Write data to file in /tmp directory.
fs.writeFile(`./tmp/${fileName}.json`, linksJson, (err) => {
if (err) {
return console.log(err);
}
});
});
}, {concurrency: 4}); // How many pages to run at a time.
});
})();
});
module.exports = router;
UPDATE:
So there is nothing wrong with my code... I realized nodemon was stopping the process after each file was saved. Since nodemon would detect a "file change" it kept restarting my server after the first item

How to intercept downloads of blob generated in client side of website through puppeteer?

I have a page on this link (https://master.d3tei1upkyr9mb.amplifyapp.com/report) with 3 export buttons.
These export buttons generate XLSX, CSV, PDF on the frontend, and hence there are no URLs for XLSX, CSV, PDF.
I need puppeteer to be able to download or get or intercept the blobs or buffers of these files in my node backend.
I tried different ways to achieve this but still haven't figured out.
It was possible through playwright library through the code written below. But I need to be able to do it with Puppeteer.
const {chromium} = require('playwright');
const fs = require('fs');
(async () => {
const browser = await chromium.launch();
const context = await browser.newContext({acceptDownloads: true});
const page = await context.newPage();
await page.goto('http://localhost:3000/');
const [ download ] = await Promise.all([
page.waitForEvent('download'), // <-- start waiting for the download
page.click('button#expoXLSX') // <-- perform the action that directly or indirectly initiates it.
]);
const path = await download.path();
console.log(path);
const newFile = await fs.readFileSync(path);
console.log(newFile);
fs.writeFile("test.xlsx", newFile, "binary",function(err) {
if(err) {
console.log(err);
} else {
console.log("The file was saved!");
}
});
await browser.close()
})();
Is there any way?
Any reason not to simulate the click on the frontend and allow puppeteer download the file to the location of your choice? You can easily download the file this way with the following:
Edit: You can determine when the file download completes by listening to the Page.downloadProgress event and checking for the completed state. Getting the actual filename saved to disk isn't 100% guaranteed with this method, but you are able to get what is termed the suggestedFileName from the Page.downloadWillBegin event, which in my tests thus far (at least on the example page in the question) does match the filename persisted to disk.
const puppeteer = require('puppeteer');
const path = require('path');
const downloadPath = path.resolve('./download');
(async ()=> {
let fileName;
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto(
'https://master.d3tei1upkyr9mb.amplifyapp.com/report',
{ waitUntil: 'networkidle2' }
);
await page._client.send('Page.setDownloadBehavior', {
behavior: 'allow',
downloadPath: downloadPath
});
await page._client.on('Page.downloadWillBegin', ({ url, suggestedFilename }) => {
console.log('download beginning,', url, suggestedFilename);
fileName = suggestedFilename;
});
await page._client.on('Page.downloadProgress', ({ state }) => {
if (state === 'completed') {
console.log('download completed. File location: ', downloadPath + '/' + fileName);
}
});
await page.click('button#expoPDF');
})();

How to download pdf file that opens in new tab with puppeteer?

I am trying to download invoice from website using puppeteer, I just started to learn puppeteer. I am using node to create and execute the code. I have managed to login and navigate to the invoice page, but it opens in new tab, so, code is not detecting it since its not the active tab. This is the code I used:
const puppeteer = require('puppeteer')
const SECRET_EMAIL = 'emailid'
const SECRET_PASSWORD = 'password'
const main = async () => {
const browser = await puppeteer.launch({
headless: false,
})
const page = await browser.newPage()
await page.goto('https://my.apify.com/sign-in', { waitUntil: 'networkidle2' })
await page.waitForSelector('div.sign_shared__SignForm-sc-1jf30gt-2.kFKpB')
await page.type('input#email', SECRET_EMAIL)
await page.type('input#password', SECRET_PASSWORD)
await page.click('input[type="submit"]')
await page.waitForSelector('#logged-user')
await page.goto('https://my.apify.com/billing#/invoices', { waitUntil: 'networkidle2' })
await page.waitForSelector('#reactive-table-1')
await page.click('#reactive-table-1 > tbody > tr:nth-child(1) > td.number > a')
const newPagePromise = new Promise(x => browser.once('targetcreated', target => x(target.page())))
const page2 = await newPagePromise
await page2.bringToFront()
await page2.screenshot({ path: 'apify1.png' })
//await browser.close()
}
main()
In the above code I am just trying to take screenshot. Can anyone help me?
Here is an example of a work-around for the chromium issue mentioned in the comments above. Adapt to fit your specific needs and use-case. Basically, you need to capture the new page (target) and then do whatever you need to do to download the file, possibly pass it as a buffer to Node as per the example below if no other means work for you (including a direct request to the download location via fetch or ideally some request library on the back-end)
const [PDF_page] = await Promise.all([
browser
.waitForTarget(target => target.url().includes('my.apify.com/account/invoices/' && target).then(target => target.page()),
ATT_page.click('#reactive-table-1 > tbody > tr:nth-child(1) > td.number > a'),
]);
const asyncRes = PDF_page.waitForResponse(response =>
response
.request()
.url()
.includes('my.apify.com/account/invoices'));
await PDF_page.reload();
const res = await asyncRes;
const url = res.url();
const headers = res.headers();
if (!headers['content-type'].includes('application/pdf')) {
await PDF_page.close();
return null;
}
const options = {
// target request options
};
const pdfAb = await PDF_page.evaluate(
async (url, options) => {
function bufferToBase64(buffer) {
return btoa(
new Uint8Array(buffer).reduce((data, byte) => {
return data + String.fromCharCode(byte);
}, ''),
);
}
return await fetch(url, options)
.then(response => response.arrayBuffer())
.then(arrayBuffer => bufferToBase64(arrayBuffer));
},
url,
options,
);
const pdf = Buffer.from(pdfAb, 'base64');
await PDF_page.close();

How to authentication using session cookie in puppeteer

I want to store my session cookie and authenticate my account using puppeteer.
Right now I'm using my username and password directly to authenticate.
const puppeteer = require('puppeteer');
(async ()=>{
const browser= await puppeteer.launch({
"headless": false,
"slowMo":20
});
const page= await browser.newPage();
await page.goto("https://www.linkedin.com/login");
await page.type('[id=username]','username');
await page.type('[id=password]','password');
await page.keyboard.press('Enter',{delay:2000});
await browser.close();
})();
Here below is an example of how to login on a web app using Puppeteer. You need to install apify (a npm module).
const Apify = require('apify');
Apify.main(async () => {
const input = await Apify.getValue('INPUT');
const browser = await Apify.launchPuppeteer();
const page = await browser.newPage();
await page.goto('https://facebook.com');
// Login
await page.type('#email', input.username);
await page.type('#pass', input.password);
await page.click('#loginbutton input');
await page.waitForNavigation();
// Get cookies
const cookies = await page.cookies();
// Use cookies in other tab or browser
const page2 = await browser.newPage();
await page2.setCookie(...cookies);
await page2.goto('https://facebook.com'); // Opens page as logged user
await browser.close();
console.log('Done.');
});
To Save the Session Cookies in puppeteer.
const cookiesObject = await page.cookies()
// Write cookies to temp file to be used in other profile pages
jsonfile.writeFile(cookiesFilePath, cookiesObject, { spaces: 2 },
function(err) {
if (err) {
console.log('The file could not be written.', err)
}
console.log('Session has been successfully saved')
})
Then, on your next iteration right before using page.goto() you can call page.setCookie() to load the cookies from the file one by one.
const previousSession = fileExistSync(cookiesFilePath)
if (previousSession) {
// If file exist load the cookies
const cookiesArr = require(`.${cookiesFilePath}`)
if (cookiesArr.length !== 0) {
for (let cookie of cookiesArr) {
await page.setCookie(cookie)
}
console.log('Session has been loaded in the browser!')
return true
}
}
The CDPSession instances are used to talk raw Chrome Devtools Protocol:
Protocol methods can be called with session.send method.
Protocol events can be subscribed to with session.on method.
Here are the official links for these as follows:
https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#pagecookiesurls
https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#pagesetcookiecookies

reuse browser instance puppeterr

I would like to know if it is possible to have one .js file that opens a browser instance, creates new page/tab logs in to a website (with username/password) and just stays idle. And in a second .js file use file one browser instance and its page.
1.js
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox'],
ignoreDefaultArgs: ["--hide-scrollbars"]
});
const page = await browser.newPage();
const response = await page.goto('https://google.com');
console.log('Browser open in the background (headless)!');
//await browser.close();
})();
2.js
const puppeteer = require('puppeteer');
(async () => {
// instructions on browser instance/page from 1.js ...
})();
The crawler object keeps the state of the browser instance and
wherever you call/pass that instance, it refers to the same chromium
in the "background". If this is an overkill, and you just want to
connect to an already running chromium using puppeteer, you can do it
with puppeteer.connect. take a look at this:
How to "hook in" puppeteer into a running Chrome instance/tab – mbit
Yeah I guess its to overkill for me :). But the link you posted was what I wanted but have 2 questions.
This Is a sample what I have.
// 1.js
// open chromium, new tab, go to google.com, print browserWSEndpoint, disconnect
const puppeteer = require('puppeteer');
(async () => {
var browser = await puppeteer.launch({headless: false});
var page = await browser.newPage();
var response = await page.goto('https://google.com');
var browserWSEndpoint = browser.wsEndpoint();
console.log(browserWSEndpoint); // prints: ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e
browser.disconnect();
})();
And
// 2.js
// connect to the open browser with the browserWSEndpoint manualy put in, ... , disconect.
const puppeteer = require('puppeteer');
(async () => {
var browser = await puppeteer.connect({browserWSEndpoint: 'ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e'});
// somehow use the tab that is open from 1.js (google.com)
await browser.disconnect();
})();
I get the browserWSEndpoint string from the console.log 1.js.
It works great but I have two difficulties.
1 - How can I use the variable browserWSEndpoint from 1.js so I dont have to always copy paste it to 2.js.
2- If I open a new page/tab on 1.js and go for example to google and disconnect (browser.disconnect()), how can use that page/tab on 2.js.
Working tested code
getEmail.js is where actual page will be exported. ask clarifications in comments.
getBrowser.js
const puppeteer = require("puppeteer");
module.exports = {
browser: {},
pptr_instance_url:"",
getBrow: async function(){ try {
console.log("line6",this.pptr_instance_url);
this.browser = await puppeteer.connect({browserWSEndpoint: this.pptr_instance_url}).catch(async e =>{
console.log("end point",this.pptr_instance_url);
this.browser = await puppeteer.launch({timeout: 0});
this.pptr_instance_url = this.browser.wsEndpoint();
console.log("line 11",this.pptr_instance_url);
return this.browser;
});
return this.browser;
}catch (e){
console.log(e)
} }
}
pageRenderer.js
const abc = require("../getBrowsernew")
const pageRenderer = async (request) => {
const {reactProjectUrl} = constants, uuidStorageKey = uuidv4(),
localStorageObject = {[uuidStorageKey]: request.body};
const browser = await abc.getBrow();
let url = "someurl.com"
await setLocalStorage(browser, url, localStorageObject);
const page = await browser.newPage();
const response = await page.goto(
url,
{
waitUntil: "networkidle0"
}, {waitUntil: 'load', timeout: 0}
);
return page;
}
module.exports = pageRenderer;
getEmail.js
const pageRenderer = require("./pageRenderer");
const getEmail =async (request) =>{
const page = await pageRenderer(request)
const emailbody = await page.content();
page.close();
return emailbody;
}
module.exports = getEmail;
You can implement this in many ways like having separate modules with functions, or different classes, and it depends on your particular need.
You can have a class that launches the browser and creates pages plus some extra functionalities.
//1.js
const puppeteer = require('puppeteer');
class Crawler {
constructor() {
//init with whatever values you'll need in your class
//or throw an error if the object wasn't created through build
}
static async build() {
let crawler = new Crawler();
await crawler._init();
return crawler;
}
async _init() {
//launch the browser and keep its state
this._browser = await puppeteer.launch({timeout: 0});
//create a page and keep its state
this._page = await this._browser.newPage();
}
//getter
get browser() {
return this._browser;
}
//getter
get page() {
return this._page;
}
async login(url) {
await this._page.goto(url);
//do whatever is related to the login process
}
}
module.exports = {Crawler};
Note that we can't have async functions in the constructor. Since launching browser is async, we use something like a build function to initiate the browser when creating the object. Then we create the crawler object like this:
//2.js
const {Crawler} = require('./1.js');
(async() => {
let crawler = await Crawler.build();
await crawler.login("https://example.com");
//access crawler's page
console.log(crawler.page.url());
})();
Keep in mind that this is only an example and by no means representative of the best practices. So first, you need to understand what you want to achieve out of such encapsulation, then adopt the method that suits you best.
Read more on JS classes here

Resources