List element of a dataLayer with puppeteer - node.js

i'm doing a web scraping bot thanks to puppeteer, and i want to verify a field of a dataLayer on some url thanks to cloud functions on Google Cloud Plateform.
right now i'm using the await page.$('dataLayer') and it return a "true", so it find the dataLayer.
there is the dataLayer on my browser (not puppeteer)
my code:
exports.datalayer = async(page) => {
//verfify if there is a dataLayer
let dl = await page.$('dataLayer')
if(dl =! null){
console.log('DataLayer found')
//want to do something like that
for(let key in dl){
let value = dl[key]
console.log('key: ' + key + ' value: ' + value)
}
//or like that
dl.forEach(element => {
console.log(element)
});
}else{
console.log('error with the dataLayer')
}
}
In order to catch the OnetrustActiveGroups data.
when i do the forEach method i get this error:
and for the for method i get this error:

better use the page.evaluate('dataLayer')
then stringify it or use it as you want :)
and with the evaluate the for(let item in dl) works fine !

Related

Chrome Extension API Calls order and DOM Information

I'm working on an extension that is supposed to extract information from the DOM based specific classes/tags,etc, then allow the user to save the information as a CSV file.
I'm getting stuck on a couple of places and haven't been able to find answers to questions similar enough.
Where I am tripped up at is:
1) Making sure that the page has completely loaded so the chrome.tabs.query doesn't return null a couple of times before the promise actually succeeds and allows the blocksF to successfully inject. I have tried placing it within a settimeout function but the chrome api doesn't seem to work within such the function.
2) Saving the extracted information so when the user moves onto a new page, the information is still there. I'm not sure if I should use the chrome.storage api call or simply save the information as an array and keep passing it through. It's just text, so I don't believe that it should take up too much space.
Then main function of the background.js is below.
let mainfunc = chrome.tabs.onUpdated.addListener(
async(id, tab) => {
if (buttonOn == true) {
let actTab = await chrome.tabs.query({
active: true,
currentWindow: true,
status: "complete"
}).catch(console.log(console.error()));
if (!actTab) {
console.log("Could not get URL. Turn extension off and on again.");
} else {
console.log("Tab information recieved.")
};
console.log(actTab);
let blocksF = chrome.scripting.executeScript({
target: { tabId: actTab[0]['id'] },
func: createBlocks
})
.catch(console.error)
if (!blocksF) {
console.log("Something went wrong.")
} else {
console.log("Buttons have been created.")
};
/*
Adds listeners and should return value of the works array if the user chose to get the information
*/
let listenersF = chrome.scripting.executeScript({
target: { tabId: actTab[0]['id'] },
func: loadListeners
})
.catch(console.error)
if (!listenersF) {
console.log("Listeners failed to load.")
} else {
console.log("Listeners loaded successfully.")
};
console.log(listenersF)
};
});
Information from the DOM is extracted through an event listener on a div/button that is added. The event listener is added within the loadListeners function.
let workArr = document.getElementById("getInfo").addEventListener("click", () => {
let domAr = Array.from(
document.querySelectorAll(<class 1>, <class 2>),
el => {
return el.textContent
}
);
let newAr = []
for (let i = 0; i < domAr.length; i++) {
if (i % 2 == 0) {
newAr.push([domAr[i], domAr[i + 1]])
}
}
newAr.forEach((work, i) => {
let table = document.getElementById('extTable');
let row = document.createElement("tr");
row.appendChild(document.createElement("td")).textContent = work[0];
row.appendChild(document.createElement("td")).textContent = work[1];
table.appendChild(row);
});
return newAr
I've been stuck on this for a couple of weeks now. Any help would be appreciated. Thank you!
There are several issues.
chrome methods return a Promise in MV3 so you need to await it or chain on it via then.
tabs.onUpdated listener's parameters are different. The second one is a change info which you can check for status instead of polling the active tab, moreover the update may happen while the tab is inactive.
catch(console.log(console.error())) doesn't do anything useful because it immediately calls these two functions so it's equivalent to catch(undefined)
Using return newArr inside a DOM event listener doesn't do anything useful because the caller of this listener is the internal DOM event dispatcher which doesn't use the returned value. Instead, your injected func should return a Promise and call resolve inside the listener when done. This requires Chrome 98 which added support for resolving Promise returned by the injected function.
chrome.tabs.onUpdated.addListener(onTabUpdated);
async function onTabUpdated(tabId, info, tab) {
if (info.status === 'complete' &&
/^https?:\/\/(www\.)?example\.com\//.test(tab.url) &&
await exec(tabId, createBlocks)) {
const [{result}] = await exec(tabId, loadListeners);
console.log(result);
// here you can save it in chrome.storage if necessary
}
}
function exec(tabId, func) {
// console.error returns `undefined` so we don't need try/catch,
// because executeScript is always an array of objects on success
return chrome.scripting.executeScript({target: {tabId}, func})
.catch(console.error);
}
function loadListeners() {
return new Promise(resolve => {
document.getElementById('getInfo').addEventListener('click', () => {
const result = [];
// ...add items to result
resolve(result);
});
});
}

Is it possible to extract how many images and if they have alt text, tables, interactive elements and page count from a PDF using PDF.js?

I am wanting to build a simple tool that can evaluate any given PDF and see how many pages it has if it has images and if so how many (and if they have alt text) and to give me a total count of how many tables, interactive elements, etc. there are. I am getting the PDFs via user upload as such I am having to deal with buffers. Is this task possible using PDF.js or something else? I have taken a look at pdf-parser and it works well for getting page count, but not more details about how many images there are if they have alt text, etc. I also tried PDF2JSON but its parsebuffer method was not working. I haven't been able to find much on the docs or Google about this.
Thank you all!
Update:
If the PDF has some tags I am able to get those tags from the PDFs structure with the following code as they seem to be positioned in the ops.argsArray[i][0], however, I am not successfully locating the alt text. Any advice?
let doc = await pdfJS.getDocument({ data }).promise;
console.log("here is the document");
console.log(doc);
console.log("here is doc info");
console.log(doc._pdfInfo);
console.log("here is meta data");
let metaData = await doc.getMetadata();
console.log(metaData);
pageCount = doc.numPages;
Array.from({ length: doc.numPages }, async (v, i) => {
pages = await doc.getPage(i + 1);
console.log("here is page");
console.log(pages);
const ops = await pages.getOperatorList();
console.log("here is ops");
console.log(ops);
console.log("here is the function array");
console.table(ops.fnArray);
console.log("here is args array");
console.table(ops.argsArray);
for (let i = 0; i < ops.fnArray.length; i++) {
if (Array.isArray(ops.argsArray[i])) {
console.log("this is data from ops");
console.log(ops.argsArray[i]);
console.log("this is info from ops.argsArray[i][0]");
console.log(ops.argsArray[i][0]);
console.log("this is data from ops.argsArray[i][0][0]");
console.log(ops.argsArray[i][0][0]);
if (typeof ops.argsArray[i][0] === "string") {
stringArray.push(ops.argsArray[i][0]);
}
// pdfObjects.push(ops.argsArray[i][0]);
} else if (ops.argsArray[i]) {
// console.log("this is null");
}
}
// console.log(stringArray);
});
return stringArray;
}

Index messed up if I upload more than one file at once

I've got the following firebase function to run once a file is uploaded to firebase storage.
It basically gets its URL and saves a reference to it in firestore. I need to save them in a way so that I can query them randomly from my client. Indexes seem to be to best fit this requirement.
for the firestore reference I need the following things:
doc ids must go from 0 to n (n beeing the index of the last
document)
have a --stats-- doc keeping track of n (gets
incremented every time a document is uploaded)
To achieve this I've written the following node.js script:
const incrementIndex = admin.firestore.FieldValue.increment(1);
export const image_from_storage_to_firestore = functions.storage
.object()
.onFinalize(async object => {
const bucket = gcs.bucket(object.bucket);
const filePath = object.name;
const splittedPath = filePath!.split("/");
// se siamo nelle immagini
// path = emotions/$emotion/photos/$photographer/file.jpeg
if (splittedPath[0] === "emotions" && splittedPath[2] === "photos") {
const emotion = splittedPath[1];
const photographer = splittedPath[3];
const file = bucket.file(filePath!);
const indexRef = admin.firestore().collection("images")
.doc("emotions").collection(emotion).doc("--stats--");
const index = await indexRef.get().then((doc) => {
if (!doc.exists) {
return 0;
} else {
return doc.data()!.index;
}
});
if (index === 0) {
await admin.firestore().collection("images")
.doc("emotions")
.collection(emotion)
.doc("--stats--")
.set({index: 0});
}
console.log("(GOT INDEX): " + index);
let imageURL;
await file
.getSignedUrl({
action: "read",
expires: "03-09-2491"
})
.then(signedUrls => {
imageURL = signedUrls[0];
});
console.log("(GOT URL): " + imageURL);
var docRef = admin.firestore()
.collection("images")
.doc("emotions")
.collection(emotion)
.doc(String(index));
console.log("uploading...");
await indexRef.update({index: incrementIndex});
await docRef.set({ imageURL: imageURL, photographer: photographer });
console.log("finished");
return true;
}
return false;
});
Getting to the problem:
It works perfectly if I upload the files one by one.
It messes up the index if I upload more than one file at once, because two concurrent uploads will read the same index value from --stats-- and one will overwrite the other.
How would you solve this problem? would you use another approach instead of the indexed one?
You should use a Transaction in which you:
read the value of the index (from "--stats--" document),
write the new index and
write the value of the imageURL in the "emotion" doc.
See also the reference docs about transactions.
This way, if the index value is changed in the "--stats--" document while the Transaction is being executed, the Cloud Function can catch the Transaction failure and generates an error which finishes it.
In parallel, you will need to enable retries for this background Cloud Function, in order it is retried if the Transaction failed in a previous run.
See this documentation item https://firebase.google.com/docs/functions/retries, including the video from Doug Stevenson which is embedded in the doc.

Click on every 'a' tag in page puppeteer

I am trying to get puppeteer to go to all a tags in a page and load them, add them to an array and return it. My puppeteer version is 1.5.0. Here is my code:
module.exports.scrapeLinks = async (page, linkXpath) => {
page.waitForNavigation();
linksElement = await page.$x(linkXpath);
var url_list_arr = [];
console.log(linksElement.length);
i=1;
for(linksElementItem in linksElement)
{
const linksData = await page.$x('(' + linkXpath + ')[' + (i + 1) +']');
if (linksData.length > 0) {
linksData[0].click();
console.log(page.url());
url_list_arr.push(page.url());
}
else {
throw new Error('Link not found');
}
}
return url_list_arr;
};
However with this code, I get an
UnhandledPromiseRejectionWarning: Error: Node is either not visible or
not an HTMLElement
I also found out through the docs that is not possible to use the xpath on the page.click function. Is there anyway to achieve this?
It is also okay if there is a function to get all the link from a page, but I couldn't find it in the docs.
To get a handle on all a-tags in an array:
const aTags= await page.$$('a')
Loop through them with:
for (const aTag of aTags) {...}
Inside the loop you can interact with each of these elementHandle separately.
Note that
await aTag.click()
will destroy (garbage collect) all elementHandles when the page context is navigated. In this case you need a workaround like loading the initial page inside a loop to always start with a fresh instance.

Calling external function from within Phantomjs+node.js

I'm going to be honest. I'm way in over my head here.
I need to scrape data from a dynamic site for my employer. Before the data is visible on the page, there are some clicks and waits necessary. Simple PHP scraping won't do. So I found out about this NodeJS + PhantomJS combo. Quite a pain to set up, but I did manage to load a site, run some code and get a result.
I wrote a piece of jQuery which uses timeout loops to wait for some data to be loaded. Eventually I get a js object that I want to write to a file (JSON).
The issue I'm facing.
I build up the the js object inside the PhantomJS .evaluate scope, which runs in a headerless browser, so not directly in my Node.JS server scope. How do I send the variable I built up inside evaluate back to my server so I can write it to my file?
Some example code (I know it's ugly, but it's for illustrative purposes). I use node-phantom-simple as a bridge between Phantom and Node
var phantom = require('node-phantom-simple'),
fs = require('fs'),
webPage = 'https://www.imagemedia.com/printing/business-card-printing/'
phantom.create(function(err, ph) {
return ph.createPage(function(err, page) {
return page.open(webPage, function(err, status) {
page.onConsoleMessage = function(msg) {
console.log(msg);
};
console.log("opened site? ", status);
page.evaluate(function() {
setTimeout(function() {
$('.price-select-cnt').eq(0).find('select').val('1266').change()
timeOutLoop()
function timeOutLoop() {
console.log('looping')
setTimeout(function() {
if ($('#ajax_price_tool div').length != 6) {
timeOutLoop()
} else {
$('.price-select-cnt').eq(1).find('select').val('25')
$('.price-select-cnt').eq(2).find('select').val('Premium Card Stock')
$('.price-select-cnt').eq(3).find('select').val('Standard').change()
timeOutLoop2()
}
}, 100)
}
function timeOutLoop2() {
console.log('looping2')
setTimeout(function() {
if ($('.pricing-cost-cnt').text() == '$0' || $('.pricing-cost-cnt').text() == '') {
timeOutLoop2()
} else {
var price = $('.pricing-cost-cnt').text()
console.log(price)
}
}, 100)
}
}, 4000)
});
});
});
});
function writeJSON(plsWrite) {
var key = 'file'
fs.writeFile('./results/' + key + '.json', plsWrite, 'utf8', function() {
console.log('The JSON file is saved as');
console.log('results/' + key + '.json');
});
}
So do do I write the price this code takes from the website, get it out of the evaluate scope and write it to a file?

Resources