Node puppeteer scraping YouTube and encountering redirected you too many times

Node puppeteer scraping YouTube and encountering redirected you too many times - node.js

I'm trying to scrape a YouTube playlists URL using Node / puppeteer. It was working, but now I'm getting ERR_TOO_MANY_REDIRECTS error. I can still access the page using chrome from my desktop.
I've tried using the chromium browser and chrome browsers. I've also tried using the puppeteer-extra stealth plugin and the random-useragent.
This is how my code stands at the moment:
// Opens the playlist page in a locally installed Chrome and waits for the cookie-consent button.
// Replace the placeholder with the real playlist URL.
const playlistUrl = '<playlist-url>';

const browser = await puppeteer.launch({
  // NOTE(review): `stealth` does not look like a puppeteer.launch option; the stealth plugin is
  // normally enabled with puppeteer.use(StealthPlugin()) — confirm before relying on it.
  stealth: true,
  // The comma must sit before the comment, otherwise it is commented out and the object literal
  // becomes a syntax error.
  headless: false, // true,
  executablePath: "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
  args: [
    '--disable-notifications', '--disable-features=site-per-process'
  ],
  defaultViewport: null
});
const page = await browser.newPage();
await page.setUserAgent(random_useragent.getRandom());
await page.goto(playlistUrl, {
  waitUntil: 'networkidle2',
  timeout: 0
});
// The attribute selector needs its closing bracket.
await page.waitForSelector('button[aria-label="Agree to the use of cookies and other data for the purposes described"]');
The script bombs at the page.goto call, and it happens even if I try going to https://www.youtube.com.
Any suggestions for what I should try next? I tried a proxy server but couldn't get it to work. I suspect I need a proxy to actually route through.

If all you need is playlist IDs for a given channel, it's possible to query a feed at:
https://youtube.com/feeds/videos.xml?channel_id=<Channel ID>
To get IDs of videos you can query a feed at:
https://youtube.com/feeds/videos.xml?playlist_id=PLAYLIST_ID

You can get playlist (and Mix) links from YouTube as in the code example below (also check the full code in the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Text typed into the YouTube search box.
const searchString = "java course";

// Shared request settings. The query is encoded with encodeURIComponent because it is placed
// inside a query-string value: encodeURI would leave characters such as "&", "=", "#" and "+"
// untouched and they would change the meaning of the URL.
const requestParams = {
  baseURL: `https://www.youtube.com`,
  encodedQuery: encodeURIComponent(searchString), // what we want to search for, URI-component encoded
};
/**
 * Collects the Mix ("ytd-radio-renderer") and playlist ("ytd-playlist-renderer") entries that are
 * currently rendered on a YouTube search results page.
 *
 * @param {object} page - Puppeteer page that already shows the search results.
 * @returns {Promise<Array<object>>} Mix entries followed by playlist entries. `link` fields are
 *   `null` when the corresponding href is absent (instead of a bogus "...undefined" URL).
 */
async function fillPlaylistsDataFromPage(page) {
  // The callback is handed to page.evaluate, so every helper it needs is declared inside it and
  // outside values (requestParams) are passed in as an argument.
  return page.evaluate(({ baseURL }) => {
    const text = (root, selector) => root.querySelector(selector)?.textContent.trim();
    const attr = (root, selector, name) => root.querySelector(selector)?.getAttribute(name);
    // Only build an absolute URL when an href was actually found.
    const absolute = (href) => (href ? `${baseURL}${href}` : null);
    const childVideos = (root) =>
      Array.from(root.querySelectorAll("ytd-child-video-renderer a"), (anchor) => ({
        title: text(anchor, "#video-title"),
        link: absolute(anchor.getAttribute("href")),
        length: text(anchor, "#length"),
      }));

    const mixes = Array.from(document.querySelectorAll("#contents > ytd-radio-renderer"), (el) => ({
      title: text(el, "a > h3 > #video-title"),
      link: absolute(attr(el, "a#thumbnail", "href")),
      videos: childVideos(el),
      thumbnail: attr(el, "a#thumbnail #img", "src"),
    }));

    const playlists = Array.from(document.querySelectorAll("#contents > ytd-playlist-renderer"), (el) => ({
      title: text(el, "a > h3 > #video-title"),
      link: absolute(attr(el, "a#thumbnail", "href")),
      channel: {
        name: text(el, "#channel-name a"),
        link: absolute(attr(el, "#channel-name a", "href")),
      },
      videoCount: text(el, "yt-formatted-string.ytd-thumbnail-overlay-side-panel-renderer"),
      videos: childVideos(el),
      thumbnail: attr(el, "a#thumbnail #img", "src"),
    }));

    return [...mixes, ...playlists];
  }, requestParams);
}
/**
 * Launches a browser, opens the YouTube search results for `requestParams.encodedQuery`, and
 * returns whatever `fillPlaylistsDataFromPage` extracts from the page.
 *
 * The browser is closed in a `finally` block so it does not stay running when navigation,
 * the selector wait, or the extraction throws.
 *
 * @returns {Promise<Array<object>>} Scraped Mix and playlist entries.
 */
async function getYoutubeSearchResults() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    // Named searchUrl so the global URL constructor is not shadowed.
    const searchUrl = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;
    page.setDefaultNavigationTimeout(60000);
    await page.goto(searchUrl);
    await page.waitForSelector("#contents > ytd-video-renderer");
    return await fillPlaylistsDataFromPage(page);
  } finally {
    await browser.close();
  }
}
// Run the scrape and print the result; report failures instead of leaving the promise
// rejection unhandled.
getYoutubeSearchResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
📌Note: to get thumbnail you need to scroll playlist into view (using .scrollIntoView() method).
Output:
[
{
"title":"Java Complete Course | Placement Series",
"link":"https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"channel":{
"name":"Apna College",
"link":"https://www.youtube.com/c/ApnaCollegeOfficial"
},
"videoCount":"35",
"videos":[
{
"title":"Introduction to Java Language | Lecture 1 | Complete Placement Course",
"link":"https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"length":"18:46"
},
{
"title":"Variables in Java | Input Output | Complete Placement Course | Lecture 2",
"link":"https://www.youtube.com/watch?v=LusTv0RlnSU&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"length":"42:36"
}
],
"thumbnail":null
},
{
"title":"Java Tutorials For Beginners In Hindi",
"link":"https://www.youtube.com/watch?v=ntLJmHOJ0ME&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
"channel":{
"name":"CodeWithHarry",
"link":"https://www.youtube.com/c/CodeWithHarry"
},
"videoCount":"113",
"videos":[
{
"title":"Introduction to Java + Installing Java JDK and IntelliJ IDEA for Java",
"link":"https://www.youtube.com/watch?v=ntLJmHOJ0ME&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
"length":"19:00"
},
{
"title":"Basic Structure of a Java Program: Understanding our First Java Hello World Program",
"link":"https://www.youtube.com/watch?v=zIdg7hkqNE0&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
"length":"14:09"
}
],
"thumbnail":null
}
]
You can read more about scraping YouTube playlists from blog post Web scraping YouTube secondary search results with Nodejs.

Related

Firebase function will not deploy when requiring outside packages

I am having trouble deploying my web scraping function and do not know how to fix the issue.
Index.js
const functions = require("firebase-functions");
const pup = require("puppeteer");
const WebsiteData = require('./schema');
exports.scrape = functions
.runWith({ memory: '1GB' })
.pubsub.schedule('0 0 * * *')
.onRun(async (context) => {
const browser = await pup.launch({
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox']
});
const page = await browser.newPage();
await page.setUserAgent(
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
)
var pages = [
{
name: 'aave-protocol',
fee:'0.09',
url: 'https://aave.com',
},
{
name: 'uniswap-v2',
fee:'0.30',
url: 'https://uniswap.org',
},
{
name: 'dydx',
fee:'0.00',
url: 'https://dydx.exchange',
},
{
name: 'dodo-bsc',
fee:'0.00',
url: 'https://dodoex.io',
},
{
name: 'cream-finance',
fee:'0.03',
url: 'https://cream.finance',
},
{
name: 'multiplier-finance',
fee:'0.06',
url: 'https://multiplier.finance',
}
]
var result = [];
for (var each in pages) {
await page.goto(`https://www.dapp.com/app/${pages[each].name}`, { waitUntil: 'networkidle0' })
var users = await page.evaluate(() => {
return document.querySelector('#app > div.app-container > div.root.dapp-detail > section.detail-page-outer > div > div.left-sec > div.stats-card > div.chart-section > div:nth-child(2) > div:nth-child(4) > span.value > span').innerText
})
var volume = await page.evaluate(() => {
return document.querySelector('#app > div.app-container > div.root.dapp-detail > section.detail-page-outer > div > div.left-sec > div.stats-card > div.chart-section > div:nth-child(3) > div:nth-child(4) > span.value > span:nth-child(2)').innerText
})
var obj = { "name": `${pages[each].name}`, "nDUniqueUsers": `${users}`, "nDTransactions": `${volume}`, "fee": `${pages[each].fee}`, "url": `${pages[each].url}`};
result.push(obj);
}
await browser.close();
const websiteMongo = new WebsiteData({ sites: result });
await websiteMongo.save(function(err,data){
if (err) {
console.log(err);
return null;
}
});
console.log("Done.")
return null;
});
The function is meant to use puppeteer to open up around 5 pages collect the info and upload the data to a MongoDB database. The code works perfectly in local host, but when I run firebase deploy, I get an error. Here is the error message: "Function failed on loading user code. This is likely due to a bug in the user code. Error message: Error: please examine your function logs to see the error cause: https://cloud.google.com/functions/docs/monitoring/logging#viewing_logs. Additional troubleshooting documentation can be found at https://cloud.google.com/functions/docs/troubleshooting#logging. Please visit https://cloud.google.com/functions/docs/troubleshooting for in-depth troubleshooting documentation."}"
I know that the problem consists of these two lines:
const pup = require("puppeteer");
const WebsiteData = require('./schema');
When I comment out those two lines I can deploy the function. Here is the code for schema.js:
var mongoose = require('mongoose')
// Connect as soon as this module is loaded.
mongoose.connect("URI");

const { Schema } = mongoose;

// Returns a fresh "required string" field definition for each schema path.
const requiredString = () => ({ type: String, required: true });

// One document holds the list of scraped sites; every field of a site is a required string.
const websiteData = new Schema({
  sites: [
    {
      name: requiredString(),
      nDUniqueUsers: requiredString(),
      nDTransactions: requiredString(),
      fee: requiredString(),
      url: requiredString(),
    },
  ],
});

const WebsiteData = mongoose.model("SiteData", websiteData);

module.exports = WebsiteData;
I do not know how to fix this. Any help would be greatly appreciated.

I suspect you haven't included "puppeteer" in the dependencies section of the package.json deployed alongside your function. Or possibly you've inadvertently included it in the devDependencies section instead of dependencies.
// package.json
{
// ...
"dependencies": {
"puppeteer": "^13.5.1" // be sure you have this
}
}

Puppeteer nodejs Error: input.on is not a function\n at new Interface

I'm generating a PDF from HTML using the npm module puppeteer.
When running the following code I'm getting an error.
It works properly on Windows, but when the same code is executed on a Red Hat Linux server, it gives an error:
// Default PDF options; caller-supplied `pageoptions` are merged on top before printing.
let poptions = {
  path: pdfPath, scale: 0.8, printBackground: true, format: "letter",
  margin: {
    bottom: 70,
    left: 25,
    right: 35,
    top: 70,
  },
  landscape: true
};
console.log(htmlPath);
const browser = await puppeteer.launch({ args: ['--no-sandbox'], dumpio: true });
let pdf;
try {
  const page = await browser.newPage();
  page.on('console', (msg) => console.log('PAGE LOG:', msg.text()));
  await page.goto(htmlPath);
  // await page.emulateMedia('print');
  // Merge in place as before, but without assigning to the misspelled `poption`, which
  // created an implicit global (and throws in strict mode / ES modules).
  Object.assign(poptions, pageoptions);
  if (pageStyle) await page.addStyleTag(pageStyle);
  pdf = await page.pdf(poptions);
} finally {
  // Close the browser even when navigation or PDF generation fails.
  await browser.close();
}
Error: input.on is not a function
at new Interface (readline.js:207:11)
at Object.createInterface (readline.js:75:10)
at Promise (/microservice/node_modules/puppeteer/lib/Launcher.js:329:25)
at new Promise ()
at waitForWSEndpoint (/microservice/node_modules/puppeteer/lib/Launcher.js:326:10)
at Launcher.launch (/microservice/node_modules/puppeteer/lib/Launcher.js:170:41)

I used the following parameters when launching Chrome. Adding --disable-setuid-sandbox resolved the issue:
// Launches Chrome with sandboxing disabled and browser output piped to the process (dumpio).
const browser = await puppeteer.launch({
  args: [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--headless',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    // NOTE(review): --disable-features is passed twice; confirm whether Chrome honours both
    // or only the last one, and merge them into one comma-separated flag if needed.
    '--disable-features=NetworkService',
    // Chrome expects "width,height" for this switch.
    '--window-size=1920,1080',
    '--disable-features=VizDisplayCompositor',
    '--log-file=/home/ec2-user/credence/microservices/reporting-server/log/server.log',
    '--log-level=0'
  ],
  dumpio: true
})

Using the default chrome profile with puppeteer that my chrome app uses

I'm having issues getting puppeteer to use the default profile that my chrome browser uses. I've tried setting path to the user profile, but when I go to a site with puppeteer that I know is saved with chrome app's userDataDir, there's nothing saved there. What am I doing wrong? I appreciate any help!
const browser = await puppeteer.launch({
headless: false,
userDataDir: 'C:\\Users\\Bob\\AppData\\Local\\Google\\Chrome\\User Data',
}).then(async browser => {
I've also tried userDataDir: 'C:/Users/Phil/AppData/Local/Google/Chrome/User Data',, but still nothing.
UPDATED:
const username = os.userInfo().username;
(async () => {
try {
const browser = await puppeteer.launch({
headless: false, args: [
`--user-data-dir=C:/Users/${username}/AppData/Local/Google/Chrome/User Data`]
}).then(async browser => {

I had the exact same issue before. However, connecting my script to a real Chrome instance helped solve a lot of problems, especially the profile one.
You can see the steps here:
https://medium.com/@jaredpotter1/connecting-puppeteer-to-existing-chrome-window-8a10828149e0
//MACOS
/*
Open this instance first:
/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 --no-first-run --no-default-browser-check --user-data-dir=$(mktemp -d -t 'chrome-remote_data_dir')
// Windows:
- Add this to Target of launching chrome --remote-debugging-port=9222
- Navigate to http://127.0.0.1:9222/json/version
- copy webSocketDebuggerUrl
More Info: https://medium.com/@jaredpotter1/connecting-puppeteer-to-existing-chrome-window-8a10828149e0
*/
// Puppeteer Part
// Always update this socket after running the instance in terminal (look up ^)
And this is an abstracted controller written in TypeScript that I always use in any project:
import * as puppeteer from 'puppeteer';
import { Browser } from 'puppeteer/lib/cjs/puppeteer/common/Browser';
import { Page } from 'puppeteer/lib/cjs/puppeteer/common/Page';
import { PuppeteerNode } from 'puppeteer/lib/cjs/puppeteer/node/Puppeteer';
import { getPuppeteerWSUrl } from './config/config';
/**
 * Thin controller around a Puppeteer connection to an already running Chrome instance.
 * Call `init()` once; afterwards `getBrowser()` / `getPage()` return the connected objects.
 */
export default class Puppeteer {
  public browser: Browser;
  public page: Page;

  /** Returns the connected browser (undefined until `init()` has succeeded). */
  getBrowser = () => {
    return this.browser;
  };

  /** Returns the page opened by `init()` (undefined until `init()` has succeeded). */
  getPage = () => {
    return this.page;
  };

  /**
   * Connects to the browser behind the WebSocket URL from `getPuppeteerWSUrl()`, opens a new
   * page, and forwards the page's console messages to the Node console.
   *
   * A connection failure is logged and rethrown. Previously it was swallowed and the method
   * then failed on `this.browser.newPage()` with an unrelated TypeError.
   */
  init = async () => {
    const webSocketUrl = await getPuppeteerWSUrl();
    try {
      this.browser = await ((puppeteer as unknown) as PuppeteerNode).connect({
        browserWSEndpoint: webSocketUrl,
        defaultViewport: {
          width: 1920,
          height: 1080,
        },
      });
      console.log('BROWSER CONNECTED OK');
    } catch (e) {
      console.error('BROWSER CONNECTION FAILED', e);
      throw e;
    }
    this.page = await this.browser.newPage();
    // Use the public text() accessor rather than the private `_text` field.
    this.page.on('console', (log: any) => console.log(log.text()));
  };
}
Abstracted WebSocket fetcher:
import axios from "axios";
import { exit } from "process";
/**
 * Asks the Chrome remote-debugging endpoint on 127.0.0.1:9222 for its version info and
 * returns the `webSocketDebuggerUrl` field.
 *
 * On any failure it prints the error plus a hint about starting Chrome with remote debugging
 * and terminates the process with exit code 1.
 */
export const getPuppeteerWSUrl = async () => {
  try {
    const { data } = await axios.get("http://127.0.0.1:9222/json/version");
    return data.webSocketDebuggerUrl;
  } catch (error) {
    console.error("Can not get puppeteer ws url. error %j", error);
    console.info(
      "Make sure you run this command (/Applications/Google Chrome.app/Contents/MacOS/Google Chrome --remote-debugging-port=9222 --no-first-run --no-default-browser-check --user-data-dir=$(mktemp -d -t 'chrome-remote_data_dir')) first on a different shell"
    );
    exit(1);
  }
};
Feel free to adjust the template to suit whatever your environment/tools currently look like.

While using capture-website, puppeteer with webpack throwing error:browser not downloaded, at ChromeLauncher.launch (webpack-internal)

I am trying to take screenshot by providing html file on node js.
I have used capture-website package.
Here is the code:
// Render file.html to file.png, overwriting an existing image.
// NOTE(review): captureWebsite.file() appears to be promise-based and to take no callback —
// confirm against the installed version. Failures are handled by the catch below.
try {
  await captureWebsite.file('file.html', 'file.png', { overwrite: true });
} catch (e) {
  console.log('error in capture image:', e);
}
Version:
node:12.16.1
capture-website: "0.8.1"
Angular : 7

You can do something like:
/**
 * Renders an HTML string in a headless browser and returns a base64-encoded screenshot.
 *
 * @param html - Markup to render.
 * @param timeout - Extra time in milliseconds to let the page settle before the screenshot
 *   is taken (default 2000).
 * @returns Base64 string of the screenshot.
 */
async function screenshotFromHtml ({ html, timeout = 2000 }: ScreenshotOptions) {
  const browser = await puppeteer.launch({
    headless: !process.env.DEBUG_HEADFULL,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
    ]
  })
  try {
    const page = await browser.newPage()
    // Set viewport to something big
    // Prevents Carbon from cutting off lines
    await page.setViewport({
      width: 2560,
      height: 1080,
      deviceScaleFactor: 2
    })
    // Wait for the content to be set before doing anything else with the page.
    await page.setContent(html)
    // Give the page time to settle BEFORE capturing; waiting after the screenshot has no
    // effect on the image.
    await new Promise((resolve) => setTimeout(resolve, timeout))
    return (await page.screenshot({ encoding: "base64" })) as string
  } finally {
    // Close browser, also when rendering fails
    await browser.close()
  }
}
This code is in typescript, but you can use the function body in your JS project
In this github file you can see a html render code too

Jest - Storyshot - getCustomBrowser - how to manage?

Having issue where i want to use a custom browser for storyshot with jest but I'm having a hard time finding any example or docs about managing the browser lifecycle - it's just mentioned offhand. My initStoryshots looks like this
// Registers the image-snapshot storyshots suite; stories whose kind contains
// "skipImageSnapshot" are excluded by the regex.
initStoryshots({
  suite: 'Image storyshots',
  storyKindRegex: /^((?!.*?skipImageSnapshot).)*$/,
  test: imageSnapshot({
    storybookUrl,
    getMatchOptions,
    // Supplies the browser instance to imageSnapshot.
    getCustomBrowser: async () =>
      puppeteer.launch({
        args: [
          '--no-sandbox ',
          '--headless',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-lcd-text',
        ],
      }),
  }),
});
So I'm not clear where I can add an afterAll or some other way to get the browser and .close() it?
Hoping to find some guidance here. Please let me know what details I can add.

Ok, solved it. Leaving record here for the next person:
Solution was to capture the testFn returned by imageSnapshot and override the afterAll on that.
// Browser created by getCustomBrowser below; kept here so the afterAll hook can close it.
let browser;

// Closes the custom browser once the suite is finished. The close() promise is awaited so the
// caller can wait for the browser to shut down instead of leaving the promise floating.
let afterAll = async () => {
  if (browser) {
    await browser.close();
  }
};

let testFn = imageSnapshot({
  storybookUrl,
  getMatchOptions,
  getCustomBrowser: async () => {
    browser = await puppeteer.launch({
      args: [
        '--no-sandbox ',
        '--headless',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-lcd-text',
      ],
    });
    return browser;
  },
});

// Override the hook on the returned test function with the one above.
testFn.afterAll = afterAll;

initStoryshots({
  suite: 'Image storyshots',
  storyKindRegex: /^((?!.*?skipImageSnapshot).)*$/,
  test: testFn,
});

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Node puppeteer scraping YouTube and encountering redirected you too many times - node.js

If all you need is playlist IDs for a given channel, it's possible to query a feed at: https://youtube.com/feeds/videos.xml?channel_id=<Channel ID> To get IDs of videos you can query a feed at: https://youtube.com/feeds/videos.xml?playlist_id=PLAYLIST_ID

Related

Firebase function will not deploy when requiring outside packages

Puppeteer nodejs Error: input.on is not a function\n at new Interface

Using the default chrome profile with puppeteer that my chrome app uses

While using capture-website, puppeteer with webpack throwing error:browser not downloaded, at ChromeLauncher.launch (webpack-internal)

Jest - Storyshot - getCustomBrowser - how to manage?

Categories

Resources