Here is the entirety of my puppeteer controller:
import { Readability } from '@mozilla/readability';
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
const summarize = require('summarize');
const keyword_extractor = require('keyword-extractor');
const amex = require('../../csv/AMEX.json');
const nasdaq = require('../../csv/NASDAQ.json');
const nyse = require('../../csv/NYSE.json');
const cryptotickers = require('../../csv/cryptos.json');
puppeteer.use(StealthPlugin());
class Reader {
constructor() {
this.browser = null;
}
async getLink(link) {
this.browser = await puppeteer.launch({
devtools: false,
headless: true,
// product: 'firefox',
executablePath: '/usr/bin/chromium-browser',
args: [
'--proxy-server=' + process.env.PROXY_HOST,
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--single-process',
'--disable-setuid-sandbox',
'--no-zygote',
'--shm-size=4gb',
'--disable-infobars',
'--ignore-certificate-errors',
'--ignore-certificate-errors-spki-list',
// '--user-agent="Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"'
],
});
const { htm, title } = await this.spa(link);
if (!htm) {
await this.browser.close();
return;
}
const text = txt(htm, link);
const data = Object.assign({}, text);
const parts = new URL(link);
if (!data.title) {
data.title = title;
}
data.summary = summary(data.content, data.title);
data.tickers = tickers(data.content, data.textContent);
data.cryptos = cryptos(data.content, data.textContent);
data.meta = getMeta(htm);
if (!data.title && data.meta.title) {
data.title = data.meta.title;
}
data.url = link;
data.htm = htm;
data.host = parts.host;
data.text = data.textContent;
delete data.textContent;
console.log('data fetched: ' + link);
await this.browser.close();
// await this.browser.disconnect();
return data;
}
async spa(url) {
let htm;
let title;
try {
let page = await this.browser.newPage();
await page.setRequestInterception(true);
page.on('request', (req) => {
if (
req.resourceType() === 'stylesheet' ||
req.resourceType() === 'font' ||
req.resourceType() == 'image'
) {
req.abort();
} else {
req.continue();
}
});
await page.authenticate({
username: process.env.PROXY_USER,
password: process.env.PROXY_PASS,
});
await page.setViewport({ width: 800, height: 600 });
// await page.goto(url, { waitUntil: 'networkidle2' });
await page.goto(url, { waitUntil: 'domcontentloaded' });
await this.autoScroll(page);
await page.evaluate(() => window.scrollTo(0, 50));
htm = await page.content();
title = await page.evaluate(() => document.title);
if (htm.indexOf('<title') === -1) {
htm = await page.evaluate(() => document.documentElement.outerHTML);
}
console.log(title, 'title');
} catch (err) {
console.error(err, url);
}
return { htm, title };
}
async autoScroll(page) {
await page.evaluate(async () => {
// wait for the scroll loop to finish before returning from page.evaluate
await new Promise((resolve, reject) => {
try {
const maxScroll = Number.MAX_SAFE_INTEGER;
let lastScroll = 0;
const interval = setInterval(() => {
window.scrollBy(0, document.body.offsetHeight);
const { scrollTop } = document.documentElement;
if (scrollTop === maxScroll || scrollTop === lastScroll) {
clearInterval(interval);
resolve();
} else {
lastScroll = scrollTop;
}
}, 1000);
} catch (error) {
reject(error);
}
}).catch((error) => {
console.error(error); // add catch here
});
});
// await page.evaluate(async () => {
// await new Promise((resolve, reject) => {
// let totalHeight = 0;
// let distance = 300;
// let timer = setInterval(() => {
// const scrollHeight = document.body.scrollHeight;
// window.scrollBy(0, distance);
// totalHeight += distance;
// if(totalHeight >= scrollHeight){
// clearInterval(timer);
// resolve();
// }
// }, 100);
// });
// });
}
} // end Class Reader
async function summarization2(text) {
let res;
let data;
console.log(text, process.env.DEEPAI_KEY);
try {
const body = new FormData();
body.append('text', text);
res = await fetch(`https://api.deepai.org/api/summarization`, {
method: 'POST',
body,
headers: {
'api-key': process.env.DEEPAI_KEY,
},
});
data = await res.json();
} catch (err) {
console.error(err);
}
return data;
}
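// Note: summarization2 above assumes a global fetch and FormData (Node 18+ or a polyfill),
// and sentiment/summarization below assume a `deepai` client has been required elsewhere.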
async function sentiment(text) {
return await deepai.callStandardApi('sentiment-analysis', { text });
}
async function summarization(text) {
return await deepai.callStandardApi('summarization', { text }).catch(console.error);
}
function summary(text, title) {
if (!text) return {};
const summary = summarize(`${title} - ${text}`);
summary.topics = keyword_extractor
.extract(`${title} - ${text}`, {
language: 'english',
remove_digits: true,
return_changed_case: true,
remove_duplicates: false,
})
.map(process);
const counts = summary.topics.reduce(
(acc, value) => ({
...acc,
[value]: (acc[value] || 0) + 1,
}),
{},
);
let topics = [];
for (let topic in counts) {
topics.push({ topic, count: counts[topic] });
}
topics = topics.filter((t) => t.topic);
topics = topics.sort((a, b) => {
return b.count - a.count;
});
topics = topics.slice(0, 10);
topics = topics.map((topic) => topic.topic);
summary.topics = topics;
function process(topic) {
topic = topic.toLowerCase().trim();
topic = topic.replace(/[\W_]+/g, '');
topic = topic.replace(/\s+/g, '-');
return topic;
}
console.log('summary: ', summary);
return summary;
}
function tickers(htm, text) {
if (!text) return [];
const tickers = [];
function findTicker(ticker, exchange) {
let name = ticker.Name;
if (name && name.indexOf('Twitter') === -1 && name.indexOf('Facebook') === -1) {
name = name.replace(/,? ?Inc\.?/gi, '').replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
const regex = new RegExp(`\\b${name}\\b`, 'gi');
if (text.match(regex)) {
console.log(name);
console.log(regex.toString());
tickers.push({ name: ticker.Name, symbol: ticker.Symbol, exchange });
}
}
amex.forEach((ticker) => {
findTicker(ticker, 'amex');
});
nasdaq.forEach((ticker) => {
findTicker(ticker, 'nasdaq');
});
nyse.forEach((ticker) => {
findTicker(ticker, 'nyse');
});
console.log(tickers);
return tickers;
}
function cryptos(htm, text) {
if (!text) return [];
const tickers = [];
function findTicker(ticker) {
const name = ticker.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const regex = new RegExp(`\\b${name}\\b`, 'g');
if (text.match(regex)) {
console.log(name);
console.log(regex.toString());
tickers.push({ name: ticker.name, symbol: ticker.symbol });
}
}
cryptotickers.forEach(findTicker);
console.log(tickers);
return tickers;
}
function getMeta(htm) {
const doc = new JSDOM(htm);
const meta = {};
const thumb =
doc.window.document.querySelector('meta[property="og:image"]') ||
doc.window.document.querySelector('meta[name="twitter:image"]');
const title = doc.window.document.title;
meta.title = title;
meta.thumb = thumb && thumb.getAttribute('content');
return meta;
}
function txt(htm, link) {
const url = new URL(link);
const doc = new JSDOM(htm);
doc.window.document.querySelectorAll('img').forEach((el) => {
const src = el.getAttribute('src') || '';
if (src.indexOf('http://') === 0) {
el.src = src.replace('http:', ''); // make absolute http URLs protocol-relative
} else if (src.indexOf('http') !== 0 && src.indexOf('//') !== 0) {
el.src = '//' + url.host + src; // resolve site-relative paths against the article host
}
});
doc.window.document
.querySelectorAll('a[href]')
.forEach(
(el) =>
(el.href =
el.href && el.href.indexOf('/') === 0
? url.protocol + '//' + url.host + el.href
: el.href),
);
const reader = new Readability(doc.window.document);
return reader.parse();
}
export default Reader;
After a few days, the Docker container ends up with too many Puppeteer processes, because the browser doesn't always exit properly when fetching URLs.
Eventually the container runs out of resources and the entire app freezes and becomes inaccessible.
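One thing that stands out in getLink above as a possible cause: browser.close() only runs on the two success paths, so if anything between spa() and the final close throws (txt(), summary(), getMeta(), or one of the ticker scans), the launched Chromium instance is never closed; and if two getLink calls ever overlap, this.browser is overwritten and one instance is orphaned. A minimal sketch of the launch/close pattern with try/finally, using a hypothetical withBrowser helper rather than the real class:

const puppeteer = require('puppeteer-extra');

// Sketch only: launch, do the work, and always close in finally, so an exception
// anywhere in the parsing path cannot leak a Chromium process.
async function withBrowser(work) {
  const browser = await puppeteer.launch({ headless: true }); // same launch args as above in practice
  try {
    return await work(browser);
  } finally {
    await browser.close().catch(() => {}); // never skipped, even if work() throws
  }
}

// Hypothetical usage inside the Reader class:
//   const data = await withBrowser((browser) => this.buildData(browser, link));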
I had the same issue when using Puppeteer inside Docker. The solution was to add dumb-init to the container: running as PID 1, it forwards signals and reaps the orphaned Chromium child processes that would otherwise accumulate as zombies. The Dockerfile should then look something like this (I assume you are developing a Node project, therefore we call npm start at the end):
RUN apt-get install dumb-init  # ... plus your other packages
... your remaining docker things
ENTRYPOINT ["/usr/bin/dumb-init", "--"]
CMD [ "npm", "start" ]
I have been scraping for some time now, and recently started using Node and Puppeteer for some projects. I built this scraper to collect Telegram links from this crypto coin marketplace site, but it's kind of slow, and I don't really know where to start when it comes to speeding it up. So my question is: how do I learn to speed up my web scrapers without losing any of the information being collected?
Here is what I have now. It tries to scrape the Telegram links from about 10000 different coin pages, then saves those links to a CSV.
const puppeteer = require('puppeteer');
const ObjectsToCsv = require('objects-to-csv');
(async () => {
const browser = await puppeteer.launch({
headless: true,
defaultViewport: null,
args: ['--start-maximized']
});
const page = await browser.newPage();
// const baseUrl = "https://coinmarketcap.com/"
let totalTelegramLinks = []
for (let i = 50; i < 101;i++){
await page.goto(`https://coinmarketcap.com/?page=${i}`, {waitUntil : 'networkidle2' }).catch(e => void 0);
console.log(`[+] Scraping Page ${i}`);
await autoScroll(page);
let allLinks = []
const grabedTableLinks = await page.evaluate(() => {
const aTags = Array.from(document.querySelectorAll('table.cmc-table tbody tr td div.sc-16r8icm-0.escjiH a.cmc-link'))
return aTags.map(a=>a.getAttribute('href'))
})
// allLinks.push([...new Set([...grabedTableLinks, ...allLinks])])
allLinks.push(...grabedTableLinks)
await page.screenshot({
path: 'yoursite.png',
fullPage: true
});
// console.log(allLinks);
console.log(allLinks.length);
// const await clickCoinLinks(page, allLinks)
totalTelegramLinks.push(...(await clickCoinLinks(page, allLinks)))
}
saveToFile(totalTelegramLinks)
console.log('\u0007')
await browser.close();
})();
// use a regex literal so the escapes survive; in the string form '\.' collapsed to '.', matching any character
const telegramRegex = /(?:http|https):\/\/(?:t\.me|telegram\.me)\/.*/
const baseUrl = "https://coinmarketcap.com"
async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 100;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight - window.innerHeight){
clearInterval(timer);
resolve();
}
}, 100);
});
});
}
async function clickCoinLinks(page, links){
let navigations = 0
let totalLinks = []
for (const url of links){
await page.goto(`${baseUrl}${url}`,{waitUntil : 'networkidle2' }).catch(e => void 0)
navigations++
const title = await page.title()
// console.log('---------')
// console.log(title)
const simpleLinkBtns = await page.$$('a.link-button')
let telegramLinks = await linkHandler(simpleLinkBtns, page)
if (telegramLinks.length){
totalLinks.push(...telegramLinks)
// telegramLinks.forEach(link => console.log(link))
}else{
// console.log('[-] No Immediate Link');
const hoverLinkBtns = await page.$$('button.link-button')
telegramLinks = await dropdownBtnHandler(hoverLinkBtns, page)
// console.log('Testing for dropdown link');
if (telegramLinks.length) totalLinks.push(...telegramLinks);
// telegramLinks ? telegramLinks.forEach(link => console.log(link)) : console.log('No dropdown Link either')
}
}
// console.log(totalLinks);
return totalLinks
}
const linkHandler = async (eleHandles, page)=>{
let linkUrls = []
for (const aTag of eleHandles){
linkUrls.push(await (await aTag.getProperty('href')).jsonValue())
}
const telegramLink = await testLinks(linkUrls, page)
return telegramLink
}
async function dropdownBtnHandler(eleHandles, page){
let linkUrls = []
let telegramLink
for (const btn of eleHandles){
const btnText = await (await btn.getProperty('innerText')).jsonValue()
if(btnText == 'Chat'){
await btn.hover()
const dropdownLinks = await page.$$('li > a.dropdownItem')
for (const aTag of dropdownLinks){
const hrefVal = await (await aTag.getProperty('href')).jsonValue();
linkUrls.push(hrefVal)
}
telegramLink = await testLinks(linkUrls, page)
}
}
return telegramLink ? telegramLink : []
}
const testLinks = async (links, page) =>{
const coin = await page.url().split('/').at(-2)
let telegramLinks = []
let coinLinks = []
links.forEach(link => {
if (telegramRegex.test(link)){
coinLinks.push(link)
}
})
// console.log(telegramLinks);
if(coinLinks.length){
const linkObj = {}
linkObj['coin'] = coin
linkObj['telegram_links'] = coinLinks
telegramLinks.push(linkObj)
}
return telegramLinks
}
const saveToFile = async (links) =>{
const csv = new ObjectsToCsv(links);
// Save to file:
await csv.toDisk('./telegram_links.csv');
// Return the CSV file as string:
// console.log(await csv.toString());
}
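As a rough sketch of where the loop above spends its time: every coin page is visited sequentially with waitUntil: 'networkidle2', which waits for the network to go quiet on each navigation. One common approach is to spread the per-coin navigations over a small pool of tabs and block heavy resources, the same request-interception trick used in the Reader class at the top of this page. Names like CONCURRENCY, withTab, scrapeAllCoins, and scrapeCoin below are made up for illustration, so treat this as a sketch rather than a drop-in replacement:

const puppeteer = require('puppeteer');

const CONCURRENCY = 4; // hypothetical: number of tabs working in parallel

// Visit one coin page and return whatever scrapeCoin extracts from it.
async function withTab(browser, url, scrapeCoin) {
  const page = await browser.newPage();
  try {
    // Skip images, fonts and stylesheets so domcontentloaded arrives sooner.
    await page.setRequestInterception(true);
    page.on('request', (req) =>
      ['image', 'font', 'stylesheet'].includes(req.resourceType()) ? req.abort() : req.continue()
    );
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
    return await scrapeCoin(page);
  } finally {
    await page.close().catch(() => {});
  }
}

// Process urls with at most CONCURRENCY tabs open at once.
async function scrapeAllCoins(browser, urls, scrapeCoin) {
  const results = [];
  let next = 0;
  const workers = Array.from({ length: CONCURRENCY }, async () => {
    while (next < urls.length) {
      const url = urls[next++];
      const result = await withTab(browser, url, scrapeCoin).catch(() => null);
      if (result) results.push(result);
    }
  });
  await Promise.all(workers);
  return results;
}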
This seems to be a common error, and I have read through all the search results on Google.
Chrome options are set as suggested in other posts.
I installed Xvfb, libXfont, and Xorg, and tried a half dozen other suggestions from the many similar posts about this issue.
Ubuntu 22.04 LTS on AWS EC2
chromedriver 101.0.4951.41
selenium-webdriver 4.2.0
This is a new EC2 instance just for this project; if I could uninstall everything and install it over again to get this working, I wouldn't mind.
There's a Dockerfile for this project, and I haven't used Docker before, but I'm now considering it since I can't get past this issue; I've overcome the handful of issues before this one, but I haven't yet seen a fix for it that works.
(The author of the bot I'm trying to run suggests using chromedriver 88, so if anyone knows how to uninstall the current version and install just that one, that would be a great tip.)
const { exec } = require("child_process")
const webdriver = require('selenium-webdriver')
const chrome = require('selenium-webdriver/chrome')
const YoutubeDlWrap = require("youtube-dl-wrap")
const youtubeDlWrap = new YoutubeDlWrap()
class Video {
async load(url, youtube_dl, msg) {
if (this.in_loading) return
this.in_loading = true
this.driver.executeScript('video.innerHTML = null')
if (youtube_dl) {
await msg.edit("Downloading...")
.then(async msg => {
console.log("Downloading...")
const fileName = await this.download(url, msg)
url = __dirname + "/client/tmp/" + fileName
})
}
await this.driver.executeScript(`video.src='${url}'`)
.then(_ => {
console.log('Loading...')
msg.edit("Loading...")
.then(_ => {
var int1 = setInterval(() => {
is_error && clearInterval(int1)
if (this.killed) {
msg.edit(":no_entry_sign: Loading stopped")
this.in_loading = false
this.killed = false
clearInterval(int1)
clearInterval(int2)
clearInterval(int3)
}
this.driver.getCurrentUrl()
.then(url => {
if (!this.init && url === "file:///channels/#me") {
this.init = true
this.open_guild()
this.join(msg)
clearInterval(int1)
}
else if(this.init)
clearInterval(int1)
})
}, 10)
})
})
// Wait until video load
let is_load
var int2 = setInterval(() => {
this.driver.executeScript("return video.duration")
.then(result => {
if (result) {
is_load = true
this.duration = result
this.in_loading = false
msg.edit("Done, Type `*play` to start playing.")
clearInterval(int2)
}
else if (is_error)
clearInterval(int2)
})
}, 10)
// Error event
let is_error
var int3 = setInterval(() => {
this.driver.executeScript('return video_error')
.then(error_msg => {
if (error_msg) {
msg.edit(":no_entry_sign: " + error_msg)
is_error = true
this.in_loading = false
this.driver.executeScript('video_error = ""')
clearInterval(int3)
return
}
else if (is_load)
clearInterval(int3)
})
}, 10)
}
download(url, msg) {
return new Promise((resolve, reject) => {
const fileName = Date.now()
const path = "./client/tmp"
exec(`rm -rf ${path}/*`, _ => {
this.download_process = youtubeDlWrap.exec([url, "-o", `${path}/video`])
.on("progress", progress => {
//console.log(progress.percent)
})
.on("error", err => {
msg.edit(":no_entry_sign: " + err.message)
.then(_ => {
this.in_loading = false
})
})
.on("close", () => {
if (this.killed) {
msg.edit(":no_entry_sign: Downloading process killed")
this.killed = false
}
else
exec(`mv ${path}/* ${path}/${fileName}`, _ => {
resolve(fileName)
})
}).youtubeDlProcess
})
})
}
play() {
console.log("Play")
this.start()
this.driver.executeScript('video.play()')
}
pause() {
console.log("Pause")
this.driver.executeScript('video.pause()')
}
current(time=null) {
if (time) {
if (time[0] === '+' || time[0] === '-') {
this.current().then(c => {
if (!c) return
let r
c = parseFloat(c)
const s = parseInt(time.slice(1))
time[0] === '+' ?
r = c + s :
r = c - s
this.driver.executeScript(`video.currentTime = ${r}`)
})
}
else
this.driver.executeScript(`video.currentTime = ${time}`)
}
else
return this.driver.executeScript("return video.currentTime")
}
hms(sec) {
if (sec)
return new Date(sec * 1000).toISOString().substr(11, 8)
return sec
}
}
class Stream extends Video {
client_url = `file://${__dirname}/client/index.html`
constructor(token, headless=true) {
super()
const chrome_options = new chrome.Options()
headless && chrome_options.addArguments('--headless')
chrome_options.addArguments('--no-sandbox')
chrome_options.addArguments('--window-size=1920,1080')
chrome_options.addArguments('--disable-web-security')
chrome_options.addArguments('--disable-dev-shm-usage')
chrome_options.addArguments('--autoplay-policy=no-user-gesture-required')
chrome_options.addArguments('user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36')
console.log("Webdriver started")
this.driver = new webdriver.Builder().forBrowser('chrome').setChromeOptions(chrome_options).build()
this.driver.get(this.client_url)
this.driver.executeScript(`localStorage.setItem("token", '"${token}"')`)
}
open_guild() {
this.driver.executeScript(`document.querySelector('[data-list-item-id="guildsnav___${this.guild_id}"]').click()`)
}
is_full() {
return this.driver.executeScript(`
return document.querySelector("[aria-label='Channel is full']")
`)
}
is_locked() {
return this.driver.executeScript(`
return document.querySelector("[data-list-item-id='channels___${this.channel_id}']").innerHTML.includes("Voice (Locked)")
`)
}
scroll() {
this.driver.executeScript(`
var c_inject = document.getElementById("channels");
if( c_inject.scrollTop === (c_inject.scrollHeight - c_inject.offsetHeight))
c_inject.scroll(0, 0)
else
c_inject.scroll(0, c_inject.scrollTop + 250)
`)
}
join(msg) {
var intJoin = setInterval(() => {
this.driver.executeScript(`document.querySelector("[data-list-item-id='channels___${this.channel_id}']").click()`)
.then(() => {
// this.is_locked()
// .then(result => {
// if (result) {
// msg.channel.send(":no_entry_sign: Channel is locked")
// return
// }
// })
// this.is_full()
// .then(result => {
// if (result) {
// msg.channel.send(":no_entry_sign: Channel is full")
// return
// }
// })
setTimeout(() => {
this.start()
}, 1000)
clearInterval(intJoin)
})
.catch(() => this.scroll())
}, 10)
}
start() {
this.driver.executeScript(`
var streamBtn_inject = document.querySelector('[aria-label="Share Your Screen"]')
!streamBtn_inject.className.includes('buttonActive-3FrkXp') &&
streamBtn_inject.click()
`).catch(e => e)
}
stop() {
console.log("Stop")
this.init = false
this.driver.get(this.client_url)
}
}
exports.Stream = Stream
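On the chromedriver 88 question above: one way to pin the driver without reinstalling anything system-wide is the chromedriver npm package, whose major version tracks the Chrome major version, wired into selenium-webdriver through a ServiceBuilder. I'm going from memory on the 4.x JavaScript API here, so treat the exact calls as an assumption to verify, and note that a matching Chrome/Chromium 88 binary still has to be installed:

// npm install chromedriver@88 selenium-webdriver
const webdriver = require('selenium-webdriver');
const chrome = require('selenium-webdriver/chrome');
const chromedriver = require('chromedriver'); // exposes the path of its bundled driver binary

const options = new chrome.Options();
options.addArguments('--headless', '--no-sandbox', '--disable-dev-shm-usage');

// Point selenium at the pinned driver instead of whatever chromedriver is on PATH.
const service = new chrome.ServiceBuilder(chromedriver.path);

const driver = new webdriver.Builder()
  .forBrowser('chrome')
  .setChromeOptions(options)
  .setChromeService(service)
  .build();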
I'm trying to add logging to my Lambda app, which I set up by following serverless-next.js, because of an issue where I can't go to the root of my app. So basically I'm deploying a Next.js app on AWS through Lambda@Edge, S3, and CloudFront.
I'm new to AWS, so I'm not really sure how to debug this at all. I assume a traditional console.log in my Lambda, where every request comes in, would log to CloudWatch. I also made sure that I deployed my Lambda to my CloudFront distribution.
Here's the code:
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.handler = void 0;
const prerender_manifest_json_1 = __importDefault(require("./prerender-manifest.json"));
const manifest_json_1 = __importDefault(require("./manifest.json"));
const next_aws_cloudfront_1 = __importDefault(require("@sls-next/next-aws-cloudfront"));
const addS3HostHeader = (req, s3DomainName) => {
req.headers["host"] = [{ key: "host", value: s3DomainName }];
};
const isDataRequest = (uri) => uri.startsWith("/_next/data");
const normaliseUri = (uri) => (uri === "/" ? "/index" : uri);
const normaliseS3OriginDomain = (s3Origin) => {
if (s3Origin.region === "us-east-1") {
return s3Origin.domainName;
}
if (!s3Origin.domainName.includes(s3Origin.region)) {
const regionalEndpoint = s3Origin.domainName.replace("s3.amazonaws.com", `s3.${s3Origin.region}.amazonaws.com`);
return regionalEndpoint;
}
return s3Origin.domainName;
};
const router = (manifest) => {
const { pages: { ssr, html } } = manifest;
const allDynamicRoutes = Object.assign(Object.assign({}, ssr.dynamic), html.dynamic);
return (uri) => {
let normalisedUri = uri;
if (isDataRequest(uri)) {
normalisedUri = uri
.replace(`/_next/data/${manifest.buildId}`, "")
.replace(".json", "");
}
if (ssr.nonDynamic[normalisedUri]) {
return ssr.nonDynamic[normalisedUri];
}
console.log(uri);
for (const route in allDynamicRoutes) {
const { file, regex } = allDynamicRoutes[route];
const re = new RegExp(regex, "i");
const pathMatchesRoute = re.test(normalisedUri);
if (pathMatchesRoute) {
return file;
}
}
if (html.nonDynamic["/404"] !== undefined) {
return "pages/404.html";
}
return "pages/_error.js";
};
};
exports.handler = (event) => __awaiter(void 0, void 0, void 0, function* () {
const request = event.Records[0].cf.request;
const uri = normaliseUri(request.uri);
const manifest = manifest_json_1.default;
const prerenderManifest = prerender_manifest_json_1.default;
const { pages, publicFiles } = manifest;
const isStaticPage = pages.html.nonDynamic[uri];
const isPublicFile = publicFiles[uri];
const isPrerenderedPage = prerenderManifest.routes[request.uri];
const origin = request.origin;
const s3Origin = origin.s3;
const isHTMLPage = isStaticPage || isPrerenderedPage;
const normalisedS3DomainName = normaliseS3OriginDomain(s3Origin);
s3Origin.domainName = normalisedS3DomainName;
if (isHTMLPage || isPublicFile) {
s3Origin.path = isHTMLPage ? "/static-pages" : "/public";
addS3HostHeader(request, normalisedS3DomainName);
if (isHTMLPage) {
request.uri = `${uri}.html`;
}
return request;
}
const pagePath = router(manifest)(uri);
if (pagePath.endsWith(".html")) {
s3Origin.path = "/static-pages";
request.uri = pagePath.replace("pages", "");
addS3HostHeader(request, normalisedS3DomainName);
return request;
}
const page = require(`./${pagePath}`);
const { req, res, responsePromise } = next_aws_cloudfront_1.default(event.Records[0].cf);
if (isDataRequest(uri)) {
const { renderOpts } = yield page.renderReqToHTML(req, res, "passthrough");
res.setHeader("Content-Type", "application/json");
res.end(JSON.stringify(renderOpts.pageData));
}
else {
page.render(req, res);
}
return responsePromise;
});
Permission:
Allow: logs:CreateLogGroup
Allow: logs:CreateLogStream
Allow: logs:PutLogEvents
What else should I do? Should I create a new log stream, or is it created automatically? I can see a log group in CloudWatch named aws/lambda, but I'm not sure how to connect the pieces.
I'd really appreciate any help
Cheers
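For what it's worth, a minimal sketch of the logging the question describes, wrapped around the generated handler; the './index' path is hypothetical, so use whatever the bundle's entry file actually is. One caveat from memory, worth verifying: the log group and stream are created automatically once the role has the three permissions listed, but for Lambda@Edge the group is typically named /aws/lambda/us-east-1.<function-name> and appears in the Region the request was served from, not necessarily the Region you deployed in.

// Hypothetical wrapper: log each incoming CloudFront request before delegating,
// so every invocation writes at least one line to CloudWatch.
const { handler: originalHandler } = require('./index'); // assumed entry file of the bundle above

exports.handler = async (event) => {
  const request = event.Records[0].cf.request;
  console.log(JSON.stringify({ uri: request.uri, method: request.method }));
  return originalHandler(event);
};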
I have a problem with WebRTC screen sharing. When one person shares their screen, the other person can't see the shared screen stream and asks them to share the screen again. I am using a Node.js Express server with Socket.IO, and Google Chrome. Chrome requires an HTTPS connection if it is not local.
This is the web application code:
(function() {
const socket = io.connect(window.location.origin);
const localVideo = document.querySelector('.localVideo');
const remoteVideos = document.querySelector('.remoteVideos');
const peerConnections = {};
var url_string =window.location.href
var url = new URL(url_string);
var de = url.searchParams.get("key");
let room = de
let getUserMediaAttempts = 5;
let gettingUserMedia = false;
let getdisplaymedia=true;
const config = {
'iceServers': [{
'urls': ['stun:stun.l.google.com:19302']
}]
};
/** @type {MediaStreamConstraints} */
const constraints = {
audio: true,
video: { facingMode: "user" }
};
socket.on('bye', function(id) {
handleRemoteHangup(id);
});
if (room && !!room) {
socket.emit('join', room);
}
window.onunload = window.onbeforeunload = function() {
socket.close();
};
socket.on('ready', function (id) {
if (!(localVideo instanceof HTMLVideoElement) || !localVideo.srcObject) {
return;
}
const peerConnection = new RTCPeerConnection(config);
peerConnections[id] = peerConnection;
if (localVideo instanceof HTMLVideoElement) {
peerConnection.addStream(localVideo.srcObject);
}
peerConnection.createOffer()
.then(sdp => peerConnection.setLocalDescription(sdp))
.then(function () {
socket.emit('offer', id, peerConnection.localDescription);
});
peerConnection.onaddstream = event => handleRemoteStreamAdded(event.stream, id);
peerConnection.onicecandidate = function(event) {
if (event.candidate) {
socket.emit('candidate', id, event.candidate);
}
};
});
socket.on('offer', function(id, description) {
const peerConnection = new RTCPeerConnection(config);
peerConnections[id] = peerConnection;
if (localVideo instanceof HTMLVideoElement) {
peerConnection.addStream(localVideo.srcObject);
}
peerConnection.setRemoteDescription(description)
.then(() => peerConnection.createAnswer())
.then(sdp => peerConnection.setLocalDescription(sdp))
.then(function () {
socket.emit('answer', id, peerConnection.localDescription);
});
peerConnection.onaddstream = event => handleRemoteStreamAdded(event.stream, id);
peerConnection.onicecandidate = function(event) {
if (event.candidate) {
socket.emit('candidate', id, event.candidate);
}
};
});
socket.on('candidate', function(id, candidate) {
peerConnections[id].addIceCandidate(new RTCIceCandidate(candidate))
.catch(e => console.error(e));
});
socket.on('answer', function(id, description) {
peerConnections[id].setRemoteDescription(description);
});
function getUserMediaSuccess(stream) {
gettingUserMedia = false;
if (localVideo instanceof HTMLVideoElement) {
!localVideo.srcObject && (localVideo.srcObject = stream);
}
socket.emit('ready');
}
function handleRemoteStreamAdded(stream, id) {
const remoteVideo = document.createElement('video');
remoteVideo.srcObject = stream;
remoteVideo.setAttribute("id", id.replace(/[^a-zA-Z]+/g, "").toLowerCase());
remoteVideo.setAttribute("playsinline", "true");
remoteVideo.setAttribute("autoplay", "true");
remoteVideos.appendChild(remoteVideo);
if (remoteVideos.querySelectorAll("video").length === 1) {
remoteVideos.setAttribute("class", "one remoteVideos");
} else {
remoteVideos.setAttribute("class", "remoteVideos");
}
}
function getUserMediaError(error) {
console.error(error);
gettingUserMedia = false;
(--getUserMediaAttempts > 0) && setTimeout(getUserMediaDevices, 1000);
}
function getUserMediaDevices() {
if (localVideo instanceof HTMLVideoElement) {
if (localVideo.srcObject) {
getUserMediaSuccess(localVideo.srcObject);
} else if (!gettingUserMedia && !localVideo.srcObject) {
gettingUserMedia = true;
navigator.mediaDevices.getDisplayMedia(constraints)
.then(getUserMediaSuccess)
.catch(getUserMediaError);
}
}
}
function handleRemoteHangup(id) {
peerConnections[id] && peerConnections[id].close();
delete peerConnections[id];
document.querySelector("#" + id.replace(/[^a-zA-Z]+/g, "").toLowerCase()).remove();
if (remoteVideos.querySelectorAll("video").length === 1) {
remoteVideos.setAttribute("class", "one remoteVideos");
} else {
remoteVideos.setAttribute("class", "remoteVideos");
}
}
getUserMediaDevices();
})();
This is the Node.js code:
const credentials = require('./credentials');
const express = require('express');
const app = express();
let server;
let port;
if (credentials.key && credentials.cert) {
const https = require('https');
server = https.createServer(credentials, app);
port = 443;
} else {
const http = require('http');
server = http.createServer(app);
port = 1517;
}
const io = require('socket.io')(server);
const RoomService = require('./RoomService')(io);
io.sockets.on('connection', RoomService.listen);
io.sockets.on('error', e => console.log(e));
app.use(express.static(__dirname + '/public'));
app.get('*', function(req, res) {
res.sendFile(`${__dirname}/public/index.html`);
});
server.listen(port, () => console.log(`Server is running on port ${port}`));
OK, this is the fixed code you can use. The key change is in getUserMediaDevices: it streams the getDisplayMedia capture and only signals ready once the video metadata has loaded and playback has started.
(function() {
const socket = io.connect(window.location.origin);
const localVideo = document.querySelector('.localVideo');
const remoteVideos = document.querySelector('.remoteVideos');
const peerConnections = {};
var url_string =window.location.href
var url = new URL(url_string);
var de = url.searchParams.get("key");
let room = de
let getUserMediaAttempts = 5;
let gettingUserMedia = false;
let getdisplaymedia=true;
/** @type {RTCConfiguration} */
const config = {
'iceServers': [{
'urls': ['stun:stun.l.google.com:19302']
}]
};
/** @type {MediaStreamConstraints} */
const constraints = {
audio: true,
video: { facingMode: "user" }
};
socket.on('full', function(room) {
alert('Room ' + room + ' is full');
});
socket.on('bye', function(id) {
handleRemoteHangup(id);
});
if (room && !!room) {
socket.emit('join', room);
}
window.onunload = window.onbeforeunload = function() {
socket.close();
};
socket.on('ready', function (id) {
if (!(localVideo instanceof HTMLVideoElement) || !localVideo.srcObject) {
return;
}
const peerConnection = new RTCPeerConnection(config);
peerConnections[id] = peerConnection;
if (localVideo instanceof HTMLVideoElement) {
peerConnection.addStream(localVideo.srcObject);
}
peerConnection.createOffer()
.then(sdp => peerConnection.setLocalDescription(sdp))
.then(function () {
socket.emit('offer', id, peerConnection.localDescription);
});
peerConnection.onaddstream = event => handleRemoteStreamAdded(event.stream, id);
peerConnection.onicecandidate = function(event) {
if (event.candidate) {
socket.emit('candidate', id, event.candidate);
}
};
});
socket.on('offer', function(id, description) {
const peerConnection = new RTCPeerConnection(config);
peerConnections[id] = peerConnection;
if (localVideo instanceof HTMLVideoElement) {
peerConnection.addStream(localVideo.srcObject);
}
peerConnection.setRemoteDescription(description)
.then(() => peerConnection.createAnswer())
.then(sdp => peerConnection.setLocalDescription(sdp))
.then(function () {
socket.emit('answer', id, peerConnection.localDescription);
});
peerConnection.onaddstream = event => handleRemoteStreamAdded(event.stream, id);
peerConnection.onicecandidate = function(event) {
if (event.candidate) {
socket.emit('candidate', id, event.candidate);
}
};
});
socket.on('candidate', function(id, candidate) {
peerConnections[id].addIceCandidate(new RTCIceCandidate(candidate))
.catch(e => console.error(e));
});
socket.on('answer', function(id, description) {
peerConnections[id].setRemoteDescription(description);
});
function getUserMediaSuccess(stream) {
gettingUserMedia = false;
if (localVideo instanceof HTMLVideoElement) {
!localVideo.srcObject && (localVideo.srcObject = stream);
}
socket.emit('ready');
}
function handleRemoteStreamAdded(stream, id) {
const remoteVideo = document.createElement('video');
remoteVideo.srcObject = stream;
remoteVideo.setAttribute("id", id.replace(/[^a-zA-Z]+/g, "").toLowerCase());
remoteVideo.setAttribute("playsinline", "true");
remoteVideo.setAttribute("autoplay", "true");
remoteVideos.appendChild(remoteVideo);
if (remoteVideos.querySelectorAll("video").length === 1) {
remoteVideos.setAttribute("class", "one remoteVideos");
} else {
remoteVideos.setAttribute("class", "remoteVideos");
}
}
function getUserMediaError(error) {
console.error(error);
gettingUserMedia = false;
(--getUserMediaAttempts > 0) && setTimeout(getUserMediaDevices, 1000);
}
function getUserMediaDevices() {
var constraints = { audio: true, video: { width: 1280, height: 720 } };
navigator.mediaDevices.getDisplayMedia(constraints)
.then(function(mediaStream) {
var video = document.querySelector('video');
video.srcObject = mediaStream;
video.onloadedmetadata = function(e) {
video.play();
getUserMediaSuccess(video.srcObject)
};
})
.catch(function(err) { console.log(err.name + ": " + err.message); }); // always check for errors at the end.
}
function handleRemoteHangup(id) {
peerConnections[id] && peerConnections[id].close();
delete peerConnections[id];
document.querySelector("#" + id.replace(/[^a-zA-Z]+/g, "").toLowerCase()).remove();
if (remoteVideos.querySelectorAll("video").length === 1) {
remoteVideos.setAttribute("class", "one remoteVideos");
} else {
remoteVideos.setAttribute("class", "remoteVideos");
}
}
getUserMediaDevices();
})();
I have written a scraper in TypeScript, running on node:10.12.0.
Issue: the code goes to sleep after a few hours, at random, and I have to restart it. My best guess is that it gets stuck on a URL request.
Tools/packages used:
Puppeteer
Cheerio
Typescript
Code:
import * as cheerio from "cheerio";
import * as request from "request";
import * as fs from "fs";
import * as shell from "shelljs";
import pup = require("puppeteer");
class App {
// @ts-ignore
public browser: pup.Browser;
public appendToFile(file: string, content: string): Promise < string > {
return new Promise < string > ((resolve, reject) => {
try {
fs.appendFileSync(file, content);
resolve("DONE");
} catch (e) {
reject(e);
}
});
}
public loadPage(url: string): Promise < any > {
return new Promise < any > ((resolve, reject) => {
request.get(url, async (err, res, html) => {
if (!err && res.statusCode === 200) {
resolve(html);
} else {
if (err) {
reject(err);
} else {
reject(res);
}
}
});
});
}
public step1(url: string): Promise < string > {
return new Promise < string > (async (resolve, reject) => {
let page: pup.Page | undefined;
try {
let next = false;
let urlLink = url;
let first = true;
let header = "unknown";
let f = url.split("/");
let folder = f[f.length - 3];
folder = folder || header;
let path = "data/" + folder;
shell.mkdir("-p", path);
page = await this.browser.newPage();
await page.goto(url, {
timeout: 0
});
let count = 1;
do {
next = false;
let res = await page.evaluate(() => {
let e = document.querySelectorAll(".ch-product-view-list-container.list-view li ul > li > h6 > a");
let p: string[] = [];
e.forEach((v) => {
p.push(("https://www.link.com") + (v.getAttribute("href") as string));
});
return p;
});
// for(const l of res) {
// try {
// await this.step2(l, "" , "")
// } catch(er) {
// this.appendToFile("./error.txt", l + "::" + url + "\n").catch(e=>e)
// }
// }
let p = [];
let c = 1;
for (const d of res) {
p.push(await this.step2(d, folder, c.toString()).catch((_e) => {
console.log(_e);
fs.appendFileSync("./error-2.txt", urlLink + " ### " + d + "\n");
}));
c++;
}
await Promise.all(p);
await this.appendToFile("./processed.txt", urlLink + ":" + count.toString() + "\n").catch(e => e);
count++;
console.log(urlLink + ":" + count);
let e = await page.evaluate(() => {
let ele = document.querySelector("#pagination-next") as Element;
let r = ele.getAttribute("style");
return r || "";
});
if (e === "") {
next = true;
await page.click("#pagination-next");
// console.log('waitng')
await page.waitFor(1000);
// console.log('done wait')
// await page.waitForNavigation({waitUntil: 'load'}).catch(e=> console.log(e));
// await Promise.all([
// page.click("#pagination-next"),
// page.waitForNavigation({ waitUntil: 'networkidle0'}), // ]);
}
} while (next);
// await page.close();
resolve("page all scrapped");
} catch (errrr) {
reject(errrr);
} finally {
if (page !== undefined) {
await page.close().catch(e => e);
}
}
});
}
public step2(url: string, folder: string, file: string): Promise < string > {
return new Promise < string > (async (resolve, reject) => {
try {
let html = await this.loadPage(url).catch(e => reject(e));
let $ = cheerio.load(html);
let ress: any = {};
let t = $(".qal_title_heading").text();
if (t) {
ress.header = t.replace(/"/g, "'").replace(/\n|\r|\t/g, "");
}
let d = $("div.ch_formatted_text.qal_thread-content_text.asker").html();
if (d) {
ress.body = d.replace(/"/g, "'").replace(/\n|\r|\t/g, "");
}
// let sprit = "-------------------------------";
let filename = "data" + file + ".json"; // ((t.replace(/[^\w\s]/gi, "")).substring(0,250)+".txt")
let data = JSON.stringify(ress); // previously: t + sprit + d + "\n---end---\n"
await this.appendToFile("./data/" + folder + "/" + filename, data + ",\n")
.then((r) => {
resolve(r);
});
} catch (err) {
reject(err);
}
});
}
}
async function main() {
process.on("SIGTERM", () => {
console.log("SigTerm received");
process.exit(1);
});
process.on("SIGINT", () => {
console.log("SigInt received");
process.exit(1);
});
let path = "data/unknown";
shell.mkdir("-p", path);
let c = new App();
let list: string[] = [];
console.log(process.argv[2]);
require("fs").readFileSync(process.argv[2], "utf-8").split(/\r?\n/).forEach((line: string) => {
list.push(line);
});
console.log("total links->" + list.length);
c.browser = await pup.launch({
headless: true
});
for (const l of list) {
await c.step1(l).then(e => {
fs.appendFileSync("./processed.txt", l);
}).catch(e => {
fs.appendFileSync("./error.txt", l);
});
}
}
main();
Let me know if you need anything else from me; this is all of the code.
So, I figured out two problems.
Chrome (under Puppeteer) consumes a lot of CPU, and the trend looks like this: it starts at moderate usage and gradually increases. In my case it started off at around 4% usage and reached 100% after a day. I've submitted an issue on their GitHub.
I did not specify a timeout in request.
was:
request.get(url, async (err, res, html) => {
should be:
request.get(url, { timeout: 1500 }, async (err, res, html) => {
So far my code has been running fine for more than a day now. The only remaining issue is the high CPU usage, but that's not a concern for me right now.
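One more thing worth flagging in the code above, in the same spirit as the request timeout: step1 navigates with page.goto(url, { timeout: 0 }), which disables Puppeteer's navigation timeout entirely, so a single page that never finishes loading can also hang the loop forever. A sketch of a bounded navigation with one retry (gotoWithDeadline is a made-up helper name and the numbers are arbitrary):

// Sketch: give each navigation a hard deadline instead of timeout: 0, and retry once before skipping the URL.
async function gotoWithDeadline(page, url, attempts = 2) {
  for (let i = 1; i <= attempts; i++) {
    try {
      await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
      return true;
    } catch (err) {
      if (i === attempts) {
        console.log(`giving up on ${url}: ${err.message}`);
        return false;
      }
    }
  }
  return false;
}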