Migrating from background page to service worker - google-chrome-extension

Manifest V2 extensions will stop working in January 2023, so I am trying to migrate to Manifest V3. My problem is that I have to convert my background page into a service worker.
With the V2 manifest I had my background.js file
const browser = chrome || browser
/**
 * Promise wrapper around `browser.tabs.query`.
 *
 * @param options Extension options; pinned tabs are excluded unless
 *                `options.countPinnedTabs` is truthy.
 * @param params  Filter handed to `tabs.query` (defaults to "all tabs").
 * @returns Promise resolving to the array of matching tabs.
 */
const tabQuery = (options, params = {}) => new Promise(res => {
  // Build a fresh filter rather than writing `pinned` onto the caller's object.
  const query = options.countPinnedTabs ? params : { ...params, pinned: false }
  browser.tabs.query(query, tabs => res(tabs))
})
const windowRemaining = options =>
tabQuery(options, { currentWindow: true })
.then(tabs => options.maxWindow - tabs.length)
const totalRemaining = options =>
tabQuery(options)
.then(tabs => options.maxTotal - tabs.length)
/**
 * Shows on the toolbar badge how many more tabs may be opened before a limit
 * (per-window or total, whichever is smaller) is hit, or clears the badge when
 * `options.displayBadge` is falsy.
 *
 * @param options Extension options (`displayBadge`, `maxWindow`, `maxTotal`, ...).
 */
const updateBadge = options => {
  // Manifest V3 renamed `browserAction` to `action`; fall back to the old name
  // so the same code keeps working under Manifest V2.
  const action = browser.action || browser.browserAction
  if (!options.displayBadge) {
    action.setBadgeText({ text: "" })
    return;
  }
  Promise.all([windowRemaining(options), totalRemaining(options)])
    .then(remaining => {
      action.setBadgeText({ text: Math.min(...remaining).toString() });
      action.setBadgeBackgroundColor({ color: "#7e7e7e" })
    })
}
const detectTooManyTabsInWindow = options => new Promise(res => {
tabQuery(options, { currentWindow: true }).then(tabs => {
if (options.maxWindow < 1) return;
if (tabs.length > options.maxWindow) res("window");
});
})
const detectTooManyTabsInTotal = options => new Promise(res => {
tabQuery(options).then(tabs => {
if (options.maxTotal < 1) return;
if (tabs.length > options.maxTotal) res("total");
});
})
const getOptions = () => new Promise((res, rej) => {
browser.storage.sync.get("defaultOptions", (defaults) => {
browser.storage.sync.get(defaults.defaultOptions, (options) => {
res(options);
})
})
})
/**
 * Renders `options.alertMessage` and shows it with `alert()`.
 *
 * Placeholders of the form `{name}` are substituted:
 *   - `place` / `which`       -> "one window" or "total"
 *   - `maxPlace` / `maxWhich` -> `options.maxWindow` or `options.maxTotal`
 *   - anything else           -> `options[name]`, or "?" when that is falsy
 *
 * @param options Extension options (`displayAlert`, `alertMessage`, limits).
 * @param place   "window" or "total": which limit was exceeded.
 * @returns Promise resolving to `false` when alerts are disabled and to `true`
 *          once the alert has been shown.
 */
const displayAlert = (options, place) => new Promise((res) => {
  if (!options.displayAlert) { return res(false) }
  const replacer = (match, p1) => {
    switch (p1) {
      case "place":
      case "which":
        return place === "window" ? "one window" : "total";
      case "maxPlace":
      case "maxWhich":
        return options["max" + capitalizeFirstLetter(place)];
      default:
        return options[p1] || "?";
    }
  };
  const renderedMessage = options.alertMessage.replace(
    /{\s*(\S+)\s*}/g,
    replacer
  )
  // NOTE(review): `alert` is a window API. If this runs where no `alert` global
  // exists (e.g. an extension service worker), the call throws and this
  // promise rejects; callers should be prepared for that.
  alert(renderedMessage);
  // Settle the promise; previously it stayed pending forever after the alert.
  res(true)
})
// Total number of open tabs seen by the last query; -1 until the first query completes.
let tabCount = -1
// Value `tabCount` held before its most recent change; -1 while there is no earlier value.
let previousTabCount = -1
// Difference between the last two distinct tab counts (negative when tabs were closed);
// -1 until the first query completes, 0 right after the first query.
let amountOfTabsCreated = -1
/**
 * Re-counts all open tabs and updates the three counters above.
 *
 * When the count has not changed since the previous call, the counters are left
 * untouched and the previously computed difference is returned again.
 *
 * @returns Promise resolving to `amountOfTabsCreated`.
 */
const updateTabCount = () => new Promise(res => browser.tabs.query({}, tabs => {
if (tabs.length == tabCount) {
return res(amountOfTabsCreated);
}
previousTabCount = tabCount
tabCount = tabs.length
// `~x` is 0 (falsy) only when x === -1, i.e. when there was no previous count yet;
// in that case report 0 created tabs instead of a bogus difference.
amountOfTabsCreated =
~previousTabCount ? tabCount - previousTabCount : 0
res(amountOfTabsCreated)
}))
let passes = 0;
const handleExceedTabs = (tab, options, place) => {
console.log(place)
if (options.exceedTabNewWindow && place === "window") {
browser.windows.create({ tabId: tab.id, focused: true});
} else {
browser.tabs.remove(tab.id);
}
}
const handleTabCreated = tab => options => {
return Promise.race([
detectTooManyTabsInWindow(options),
detectTooManyTabsInTotal(options)
])
.then((place) => updateTabCount().then(amountOfTabsCreated => {
if (passes > 0) {
console.log("passed with pass no. ", passes)
passes--;
return;
}
console.log("amountOfTabsCreated", amountOfTabsCreated)
displayAlert(options, place)
if (amountOfTabsCreated === 1) {
handleExceedTabs(tab, options, place);
app.update()
} else if (amountOfTabsCreated > 1) {
passes = amountOfTabsCreated - 1
} else if (amountOfTabsCreated === -1) {
handleExceedTabs(tab, options, place);
app.update()
} else {
throw new Error("weird: multiple tabs closed after tab created")
}
}))
}
const app = {
init: function() {
browser.storage.sync.set({
defaultOptions: {
maxTotal: 20,
maxWindow: 20,
exceedTabNewWindow: false,
displayAlert: true,
countPinnedTabs: false,
displayBadge: true,
alertMessage: chrome.i18n.getMessage("string_7")
}
});
browser.tabs.onCreated.addListener(tab =>
getOptions().then(handleTabCreated(tab))
)
console.log("init", this)
browser.windows.onFocusChanged.addListener(app.update)
browser.tabs.onCreated.addListener(app.update)
browser.tabs.onRemoved.addListener(app.update)
browser.tabs.onUpdated.addListener(app.update)
},
update: () => {
updateTabCount();
getOptions().then(updateBadge)
}
};
app.init();
app.update();
/**
 * Returns `string` with its first character upper-cased.
 *
 * @param string Text to capitalize.
 * @returns The capitalized text; an empty string for empty or missing input
 *          (previously this threw because `string[0]` was undefined).
 */
function capitalizeFirstLetter(string) {
  if (!string) return "";
  return string[0].toUpperCase() + string.slice(1);
}
Any suggestions on how to convert it for service worker to work with Manifest V3?
Thank you and I look forward to your comments

Related

Camera flipping issue -Twilio version 2.24

Flipping the camera doesn't work on devices that have more than two video inputs. On first load the video appears, but when the flip-camera button is clicked the application throws an error.
Expected behavior:
The camera should be flipped (environment)
Actual behavior:
It throws the following error:
error : call to getusermedia failed domexception could not start video source
Software versions:
Browser(s): Chrome
Operating System: Android (devices I tested on which it fails, e.g. Samsung M31, Redmi Note 11T, OnePlus 7T)
twilio-video.js: 2.24.0
Third-party libraries (e.g., Angular, nodejs, etc.):
Code used to start twilio stream
async startTwilioStream(twilioToken: string, localVideo: ElementRef, remoteVideo: ElementRef): Promise<void> {
console.log('startTwilioStream');
this.localVideoElement = localVideo;
this.remoteVideoElement = remoteVideo;
await this.startLocalVideo(this.localVideoElement);
this.connectOptions = {
video: false,
audio: false,
tracks: [this.localAudioTrack, this.localVideoTrack],
audioConstraints: {
mandatory: {
googAutoGainControl: false,
},
},
region: 'in1',
preferredAudioCodecs: ['opus'],
preferredVideoCodecs: ['H264'],
};
connect(twilioToken, this.connectOptions).then((twilioRoom: any) => {
console.log('twilioRoom.localParticipant ================== ', twilioRoom.localParticipant);
setTimeout(() => {
if (this.remoteVideoElement?.nativeElement) {
this.remoteVideoElement.nativeElement.muted = false;
}
}, 5000);
this.twilioRoom = twilioRoom;
console.log('this.twilioRoom vvvv', this.twilioRoom);
twilioRoom.localParticipant.setNetworkQualityConfiguration({
local: 2,
remote: 1,
});
// flip.addEventListener('change', this.updateVideoDevice);
twilioRoom.on('participantConnected', participant => {
console.log('participant Connected===============', participant);
participant.tracks.forEach((publication) => {
console.log('publication', publication);
if (publication.isSubscribed) {
const track = publication.track;
this.attachTracks([track]);
}
});
this.twilioRoom = twilioRoom;
});
twilioRoom.on('participantDisconnected', participant => {
console.log('participantDisconnected', participant);
console.log('SOME PARTICIPANT DISCONNECTED');
if ((participant.identity === 'agent-screen-share' && this.serviceUserType !== 'agent') || (participant.identity === 'consumer-screen-share' && this.serviceUserType !== 'consumer')) {
this.changeDetectionEmitter.emit('remoteScreenShareStopped');
this.isRemoteScreenShareOn = false;
} else if (participant.identity !== 'agent-screen-share' && participant.identity !== 'consumer-screen-share') {
console.log('real participant dced');
this.remoteMediaStream = null;
this.detachTracks(participant);
this.isRemoteVideoOn = false;
}
this.twilioRoom = twilioRoom;
});
twilioRoom.participants.forEach((participant) => {
participant.tracks.forEach((publication) => {
if (publication.track) {
const track = publication.track;
this.attachTracks([track]);
}
});
participant.on('trackSubscribed', (track) => {
console.log('trackSubscribed', track);
this.attachTracks([track]);
});
this.twilioRoom = twilioRoom;
});
twilioRoom.on('trackAdded', (track, participant) => {
console.log('trackAdded', track, participant);
this.attachTracks([track]);
this.twilioRoom = twilioRoom;
});
// When a Participant adds a Track, attach it to the DOM.
twilioRoom.on('trackSubscribed', (track, err, participant) => {
console.log('trackSubscribed', track);
this.sendLoaderStatus('ringing');
if ((participant.identity === 'agent-screen-share' && this.serviceUserType !== 'agent') || (participant.identity === 'consumer-screen-share' && this.serviceUserType !== 'consumer')) {
this.attachScreenShareTrack([track]);
} else if (participant.identity === 'agent-screen-share' || participant.identity === 'consumer-screen-share') {
} else {
this.attachTracks([track]);
}
this.twilioRoom = twilioRoom;
});
// When a Participant removes a Track, detach it from the DOM.
twilioRoom.on('trackRemoved', (track, participant) => {
console.log('trackRemoved', track);
this.detachTracks([track]);
this.twilioRoom = twilioRoom;
});
}, err => {
});
}
Start local video and local audio track
/**
 * Creates the local audio and video tracks and previews the video in `localVideo`.
 *
 * @param localVideo Element ref whose `nativeElement` receives the preview stream.
 * @param deviceId   Despite its name, this value is passed as the `facingMode`
 *                   constraint of the video track (default `'user'`).
 * @throws Re-throws the error from track creation after alerting the user.
 */
async startLocalVideo(localVideo: ElementRef, deviceId = 'user'): Promise<void> {
  this.localVideoElement = localVideo;

  // Track creation rejects on failure instead of resolving to a falsy value, so
  // the "source not found" alerts must live in catch blocks to ever be shown.
  let localAudioTrack;
  try {
    localAudioTrack = await createLocalAudioTrack({
      audio: true
    });
  } catch (err) {
    alert('Audio source not found, do you have a mic connected ?');
    throw err;
  }

  let localVideoTrack;
  try {
    localVideoTrack = await createLocalVideoTrack({
      facingMode: deviceId
    });
  } catch (err) {
    // Do not leave the microphone captured when the camera is unavailable.
    localAudioTrack.stop();
    alert('Video source not found, do you have a videocam connected ?');
    throw err;
  }

  this.localAudioTrack = localAudioTrack;
  this.localVideoTrack = localVideoTrack;
  console.log('this.localVideoTrack to check', this.localVideoTrack);
  this.localDisplayMediaStream = new MediaStream();
  console.log('this.localVideoTrack.mediaStreamTrack to check', this.localVideoTrack.mediaStreamTrack);
  this.localDisplayMediaStream.addTrack(this.localVideoTrack.mediaStreamTrack);
  console.log('this.localDisplayMediaStream to check', this.localDisplayMediaStream);
  this.localVideoElement.nativeElement.srcObject = this.localDisplayMediaStream;
}
Flip event listener calls on the click of switch button
// Each click toggles between the front ("user") and rear ("environment")
// camera and asks the Twilio service to switch to the newly selected one.
const flip = document.querySelector('#flip');
flip.addEventListener('click', (e) => {
  this.facingMode = this.facingMode == "user" ? "environment" : "user";
  this.twilioService.switch(this.facingMode)
});
Switch camera function calls in flip event listener
/**
 * Switches the local camera to the given facing mode.
 *
 * The current camera is released first, then a new video track is created,
 * shown in the local preview and, when a room is connected, published in
 * place of the old track.
 *
 * @param facingMode `'user'` or `'environment'`.
 */
async switch(facingMode) {
  console.log(this.localDisplayMediaStream);
  const localParticipant = this.twilioRoom?.localParticipant;

  if (this.localDisplayMediaStream) {
    // Release the capture device before requesting another one.
    this.localDisplayMediaStream.getTracks().forEach(track => {
      track.stop();
    });
    if (localParticipant) {
      // Iterate over a snapshot: unpublishing removes entries from `videoTracks`.
      // The old code only stopped these tracks (and awaited `forEach`, which
      // returns undefined), so the dead track stayed published in the room.
      Array.from(localParticipant.videoTracks.values()).forEach((publication: any) => {
        console.log('track', publication);
        publication.track.stop();
        localParticipant.unpublishTrack(publication.track);
      });
    }
  }

  const localVideoTrack = await createLocalVideoTrack({
    facingMode: facingMode
  });
  this.localVideoTrack = localVideoTrack;
  this.localDisplayMediaStream = new MediaStream();
  this.localDisplayMediaStream.addTrack(this.localVideoTrack.mediaStreamTrack);
  this.localVideoElement.nativeElement.srcObject = this.localDisplayMediaStream;

  if (localParticipant) {
    // Send the new camera to the room; previously only the local preview changed.
    await localParticipant.publishTrack(localVideoTrack);
  }
}

WebDriverError: unknown error: DevToolsActivePort file doesn't exist - no fixes have worked

This seems to be a common error, and I have read through all the search results on google.
Chrome options are set as suggested in other posts
Installed Xvfb libXfont Xorg
and a half dozen other suggestions from the many other similar posts about this issue.
Ubuntu 22.04 LTS on AWS EC2
chromedriver 101.0.4951.41
selenium-webdriver 4.2.0
This is a new EC2 instance created just for this project, so I don't mind uninstalling everything and starting over if that gets it working.
There is a Dockerfile for this project. I haven't used Docker before, but I'm now considering it because I can't get past this issue. I've overcome a handful of other issues before this one, but I haven't yet seen a fix for this that works.
(The author of the bot I'm trying to run suggests using chromedriver 88, so if anyone knows how to uninstall the current version and install exactly that one, that would be a great tip.)
const { exec } = require("child_process")
const webdriver = require('selenium-webdriver')
const chrome = require('selenium-webdriver/chrome')
const YoutubeDlWrap = require("youtube-dl-wrap")
const youtubeDlWrap = new YoutubeDlWrap()
class Video {
async load(url, youtube_dl, msg) {
if (this.in_loading) return
this.in_loading = true
this.driver.executeScript('video.innerHTML = null')
if (youtube_dl) {
await msg.edit("Downloading...")
.then(async msg => {
console.log("Downloading...")
const fileName = await this.download(url, msg)
url = __dirname + "/client/tmp/" + fileName
})
}
await this.driver.executeScript(`video.src='${url}'`)
.then(_ => {
console.log('Loading...')
msg.edit("Loading...")
.then(_ => {
var int1 = setInterval(() => {
is_error && clearInterval(int1)
if (this.killed) {
msg.edit(":no_entry_sign: Loading stopped")
this.in_loading = false
this.killed = false
clearInterval(int1)
clearInterval(int2)
clearInterval(int3)
}
this.driver.getCurrentUrl()
.then(url => {
if (!this.init && url === "file:///channels/#me") {
this.init = true
this.open_guild()
this.join(msg)
clearInterval(int1)
}
else if(this.init)
clearInterval(int1)
})
}, 10)
})
})
// Wait until video load
let is_load
var int2 = setInterval(() => {
this.driver.executeScript("return video.duration")
.then(result => {
if (result) {
is_load = true
this.duration = result
this.in_loading = false
msg.edit("Done, Type `*play` to start playing.")
clearInterval(int2)
}
else if (is_error)
clearInterval(int2)
})
}, 10)
// Error event
let is_error
var int3 = setInterval(() => {
this.driver.executeScript('return video_error')
.then(error_msg => {
if (error_msg) {
msg.edit(":no_entry_sign: " + error_msg)
is_error = true
this.in_loading = false
this.driver.executeScript('video_error = ""')
clearInterval(int3)
return
}
else if (is_load)
clearInterval(int3)
})
}, 10)
}
download(url, msg) {
return new Promise((resolve, reject) => {
const fileName = Date.now()
const path = "./client/tmp"
exec(`rm -rf ${path}/*`, _ => {
this.download_process = youtubeDlWrap.exec([url, "-o", `${path}/video`])
.on("progress", progress => {
//console.log(progress.percent)
})
.on("error", err => {
msg.edit(":no_entry_sign: " + err.message)
.then(_ => {
this.in_loading = false
})
})
.on("close", () => {
if (this.killed) {
msg.edit(":no_entry_sign: Downloading process killed")
this.killed = false
}
else
exec(`mv ${path}/* ${path}/${fileName}`, _ => {
resolve(fileName)
})
}).youtubeDlProcess
})
})
}
play() {
console.log("Play")
this.start()
this.driver.executeScript('video.play()')
}
pause() {
console.log("Pause")
this.driver.executeScript('video.pause()')
}
current(time=null) {
if (time) {
if (time[0] === '+' || time[0] === '-') {
this.current().then(c => {
if (!c) return
let r
c = parseFloat(c)
const s = parseInt(time.slice(1))
time[0] === '+' ?
r = c + s :
r = c - s
this.driver.executeScript(`video.currentTime = ${r}`)
})
}
else
this.driver.executeScript(`video.currentTime = ${time}`)
}
else
return this.driver.executeScript("return video.currentTime")
}
hms(sec) {
if (sec)
return new Date(sec * 1000).toISOString().substr(11, 8)
return sec
}
}
class Stream extends Video {
client_url = `file://${__dirname}/client/index.html`
constructor(token, headless=true) {
super()
const chrome_options = new chrome.Options()
headless && chrome_options.addArguments('--headless')
chrome_options.addArguments('--no-sandbox')
chrome_options.addArguments('--window-size=1920,1080')
chrome_options.addArguments('--disable-web-security')
chrome_options.addArguments('--disable-dev-shm-usage')
chrome_options.addArguments('--autoplay-policy=no-user-gesture-required')
chrome_options.addArguments('user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36')
console.log("Webdriver started")
this.driver = new webdriver.Builder().forBrowser('chrome').setChromeOptions(chrome_options).build()
this.driver.get(this.client_url)
this.driver.executeScript(`localStorage.setItem("token", '"${token}"')`)
}
open_guild() {
this.driver.executeScript(`document.querySelector('[data-list-item-id="guildsnav___${this.guild_id}"]').click()`)
}
is_full() {
return this.driver.executeScript(`
return document.querySelector("[aria-label='Channel is full']")
`)
}
is_locked() {
return this.driver.executeScript(`
return document.querySelector("[data-list-item-id='channels___${this.channel_id}']").innerHTML.includes("Voice (Locked)")
`)
}
scroll() {
this.driver.executeScript(`
var c_inject = document.getElementById("channels");
if( c_inject.scrollTop === (c_inject.scrollHeight - c_inject.offsetHeight))
c_inject.scroll(0, 0)
else
c_inject.scroll(0, c_inject.scrollTop + 250)
`)
}
join(msg) {
var intJoin = setInterval(() => {
this.driver.executeScript(`document.querySelector("[data-list-item-id='channels___${this.channel_id}']").click()`)
.then(() => {
// this.is_locked()
// .then(result => {
// if (result) {
// msg.channel.send(":no_entry_sign: Channel is locked")
// return
// }
// })
// this.is_full()
// .then(result => {
// if (result) {
// msg.channel.send(":no_entry_sign: Channel is full")
// return
// }
// })
setTimeout(() => {
this.start()
}, 1000)
clearInterval(intJoin)
})
.catch(() => this.scroll())
}, 10)
}
start() {
this.driver.executeScript(`
var streamBtn_inject = document.querySelector('[aria-label="Share Your Screen"]')
!streamBtn_inject.className.includes('buttonActive-3FrkXp') &&
streamBtn_inject.click()
`).catch(e => e)
}
stop() {
console.log("Stop")
this.init = false
this.driver.get(this.client_url)
}
}
exports.Stream = Stream

How can I compare data from two maps

I tried to compare the data from two tables in my database, but I failed.
I'm not very familiar with React yet; I'm still learning it. I'm trying to compare the data from recommendation and customization, and if they are the same I want to display them.
const getRecommendation = () => {
Axios.get("http://localhost:5000/recommendations").then((response) => {
setRecomList(response.data);
});
};
const getCostumization = () => {
Axios.get("http://localhost:5000/customizations").then((response) => {
setCustomList(response.data);
});
};
const getRecById = async (id) => {
Axios.get(`http://localhost:5000/recommendations/${id}`).then((res) => {
setRecById(
recById.filter((val) => {
return val._id === id;
})
);
});
};
useEffect(() => {
{
recommendation.map((rec, i) => {
customization.map((cus, j) => {
if (
rec.type === cus.type &&
rec.violonBody === cus.violonBody &&
rec.violonStick === cus.violonStick &&
rec.violonChincrest === cus.violonChincrest
) {
getCostumization();
}
});
});
}
});
Thank you!
You can use like below
/**
 * Shallow equality of two plain objects: same set of own enumerable keys and
 * strictly equal (`===`) values for every key.
 *
 * @param obj1 First object.
 * @param obj2 Second object.
 * @returns `true` when both objects have identical keys and values.
 */
const compare = (obj1, obj2) => {
  const keys1 = Object.keys(obj1);
  const keys2 = Object.keys(obj2);
  if (keys1.length !== keys2.length) return false;
  // The key must actually exist on obj2: otherwise { a: undefined } and { b: 1 }
  // would compare equal, because the missing obj2.a also reads as undefined.
  return keys1.every(
    (key) => Object.prototype.hasOwnProperty.call(obj2, key) && obj1[key] === obj2[key]
  );
};
console.log(compare({ a: 1, b: 2 }, { a: 1, b: 2}));
console.log(compare({ a: 1, b: 2 }, { a: 1, b: 2, c:3 }));
Comparing objects and arrays is not easy to do by hand, because it requires a deep comparison.
One popular and convenient way is to use `_.isEqual` from the lodash library.
You could use it like this:
var object = { 'a': 1 };
var other = { 'a': 1 };
_.isEqual(object, other);
// => true
object === other;
// => false
Could you do something like this? This should return an array of objects that are found in both arrays:
/**
 * Collects the elements of `arr1` that have a match in `arr2`.
 *
 * An element of `arr1` is pushed once per matching element of `arr2`, so it can
 * appear several times when `arr2` contains several matches.
 *
 * @param arr1    Array whose elements are returned.
 * @param arr2    Array to match against.
 * @param isMatch Optional predicate deciding whether two elements match;
 *                defaults to comparing their `name` properties.
 * @returns Array of the matching elements of `arr1`.
 */
function compareArrays(arr1, arr2, isMatch = (a, b) => a.name === b.name) {
  const same = [];
  // The loop variables were declared with `const`, so the first `i++` / `j++`
  // threw "Assignment to constant variable"; for...of needs no counters at all.
  for (const left of arr1) {
    for (const right of arr2) {
      if (isMatch(left, right)) {
        same.push(left);
      }
    }
  }
  return same;
}
So using your example it would look like this:
const getRecommendation = () => {
Axios.get("http://localhost:5000/recommendations").then((response) => {
setRecomList(response.data);
});
};
const getCostumization = () => {
Axios.get("http://localhost:5000/customizations").then((response) => {
setCustomList(response.data);
});
};
const getRecById = async (id) => {
Axios.get(`http://localhost:5000/recommendations/${id}`).then((res) => {
setRecById(
recById.filter((val) => {
return val._id === id;
})
);
});
};
useEffect(() => {
const equalPairArray = compareArrays(recommendation, customization)
if(equalPairArray.length > 0){
getCostumization();
}
});

Why do I get zombie puppeteer processes on alpine/docker?

Here is the entirety of my puppeteer controller:
import { Readability } from '#mozilla/readability';
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
const summarize = require('summarize');
const keyword_extractor = require('keyword-extractor');
const amex = require('../../csv/AMEX.json');
const nasdaq = require('../../csv/NASDAQ.json');
const nyse = require('../../csv/NYSE.json');
const cryptotickers = require('../../csv/cryptos.json');
puppeteer.use(StealthPlugin());
/**
 * Loads a URL in a throw-away headless Chromium and turns the rendered page
 * into article data (readable text, summary, tickers, cryptos, meta).
 */
class Reader {
  constructor() {
    // Most recently launched browser; spa() falls back to it when no browser
    // is passed explicitly.
    this.browser = null;
  }

  /**
   * Fetches `link` and extracts article data from it.
   *
   * @param link Absolute URL to fetch.
   * @returns The extracted data, or `undefined` when the page HTML could not be fetched.
   */
  async getLink(link) {
    // Keep the handle in a local variable: `this.browser` is overwritten when
    // two getLink() calls overlap, which left the earlier Chromium with nobody
    // to close it.
    const browser = await puppeteer.launch({
      devtools: false,
      headless: true,
      // product: 'firefox',
      executablePath: '/usr/bin/chromium-browser',
      args: [
        '--proxy-server=' + process.env.PROXY_HOST,
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--single-process',
        '--disable-setuid-sandbox',
        '--no-zygote',
        '--shm-size=4gb',
        '--disable-infobars',
        // Spelling fixed: these were "certifcate", which is not the flag name.
        '--ignore-certificate-errors',
        '--ignore-certificate-errors-spki-list',
        // '--user-agent="Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"'
      ],
    });
    this.browser = browser;

    try {
      const { htm, title } = await this.spa(link, browser);
      if (!htm) {
        return;
      }
      const text = txt(htm, link);
      const data = Object.assign({}, text);
      const parts = new URL(link);
      if (!data.title) {
        data.title = title;
      }
      data.summary = summary(data.content, data.title);
      data.tickers = tickers(data.content, data.textContent);
      data.cryptos = cryptos(data.content, data.textContent);
      data.meta = getMeta(htm);
      if (!data.title && data.meta.title) {
        data.title = data.meta.title;
      }
      data.url = link;
      data.htm = htm;
      data.host = parts.host;
      data.text = data.textContent;
      delete data.textContent;
      console.log('data fetched: ' + link);
      return data;
    } finally {
      // Runs on every path, including when parsing or summarising throws, so
      // the Chromium started above never outlives this call.
      await browser.close().catch((err) => console.error(err, link));
    }
  }

  /**
   * Renders `url` in a new page and returns its HTML and title.
   * Errors are logged, not thrown; on failure both fields may be undefined.
   *
   * @param url     Page to load.
   * @param browser Browser to open the page in (defaults to `this.browser`).
   * @returns `{ htm, title }`.
   */
  async spa(url, browser = this.browser) {
    let htm;
    let title;
    let page;
    try {
      page = await browser.newPage();
      await page.setRequestInterception(true);
      // Skip stylesheets, fonts and images: only the markup is needed.
      page.on('request', (req) => {
        if (
          req.resourceType() === 'stylesheet' ||
          req.resourceType() === 'font' ||
          req.resourceType() == 'image'
        ) {
          req.abort();
        } else {
          req.continue();
        }
      });
      await page.authenticate({
        username: process.env.PROXY_USER,
        password: process.env.PROXY_PASS,
      });
      await page.setViewport({ width: 800, height: 600 });
      // await page.goto(url, { waitUntil: 'networkidle2' });
      await page.goto(url, { waitUntil: 'domcontentloaded' });
      await this.autoScroll(page);
      await page.evaluate(() => window.scrollTo(0, 50));
      htm = await page.content();
      title = await page.evaluate(() => document.title);
      if (htm.indexOf('<title') === -1) {
        htm = await page.evaluate(() => document.documentElement.outerHTML);
      }
      console.log(title, 'title');
    } catch (err) {
      console.error(err, url);
    } finally {
      // Close the page even when navigation or evaluation failed.
      if (page) {
        await page.close().catch((err) => console.error(err, url));
      }
    }
    return { htm, title };
  }

  /**
   * Starts scrolling the page downwards once per second until the scroll
   * position stops changing.
   *
   * NOTE(review): the inner promise is neither returned nor awaited inside
   * `page.evaluate`, so this method resolves immediately while scrolling
   * continues in the page. That timing is kept as-is; confirm whether waiting
   * for the scroll to finish is actually wanted before changing it.
   */
  async autoScroll(page) {
    await page.evaluate(async () => {
      new Promise((resolve, reject) => {
        try {
          const maxScroll = Number.MAX_SAFE_INTEGER;
          let lastScroll = 0;
          const interval = setInterval(() => {
            window.scrollBy(0, document.body.offsetHeight);
            const { scrollTop } = document.documentElement;
            if (scrollTop === maxScroll || scrollTop === lastScroll) {
              clearInterval(interval);
              resolve();
            } else {
              lastScroll = scrollTop;
            }
          }, 1000);
        } catch (error) {
          reject(error);
        }
      }).catch((error) => {
        console.error(error); // add catch here
      });
    });
  }
} // end Class Reader
/**
 * Requests a summary of `text` from the DeepAI summarization endpoint.
 *
 * @param text Text to summarize.
 * @returns The parsed JSON response, or `undefined` when the request or the
 *          JSON parsing failed (the error is logged).
 */
async function summarization2(text) {
  let data;
  // Log the input only: the API key is a secret and must not end up in logs.
  console.log(text);
  try {
    const body = new FormData();
    body.append('text', text);
    const res = await fetch(`https://api.deepai.org/api/summarization`, {
      method: 'POST',
      body,
      headers: {
        'api-key': process.env.DEEPAI_KEY,
      },
    });
    data = await res.json();
  } catch (err) {
    console.error(err);
  }
  return data;
}
async function sentiment(text) {
return await deepai.callStandardApi('sentiment-analysis', { text });
}
async function summarization(text) {
return await deepai.callStandardApi('summarization', { text }).catch(console.error);
}
function summary(text, title) {
if (!text) return {};
const summary = summarize(`${title} - ${text}`);
summary.topics = keyword_extractor
.extract(`${title} - ${text}`, {
language: 'english',
remove_digits: true,
return_changed_case: true,
remove_duplicates: false,
})
.map(process);
const counts = summary.topics.reduce(
(acc, value) => ({
...acc,
[value]: (acc[value] || 0) + 1,
}),
{},
);
let topics = [];
for (let topic in counts) {
topics.push({ topic, count: counts[topic] });
}
topics = topics.filter((t) => t.topic);
topics = topics.sort((a, b) => {
return b.count - a.count;
});
topics = topics.slice(0, 10);
topics = topics.map((topic) => topic.topic);
summary.topics = topics;
function process(topic) {
topic = topic.toLowerCase().trim();
topic = topic.replace(/[\W_]+/g, '');
topic = topic.replace(/\s+/g, '-');
return topic;
}
console.log('summary: ', summary);
return summary;
}
/**
 * Finds the listed companies (AMEX, NASDAQ, NYSE) whose name occurs in `text`.
 *
 * @param htm  Unused; kept for signature compatibility.
 * @param text Plain text to search.
 * @returns Array of `{ name, symbol, exchange }` matches, or `{}` when `text` is empty.
 */
function tickers(htm, text) {
  if (!text) return {};
  const tickers = [];
  // Escapes regex metacharacters so a company name is matched literally.
  const escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  function findTicker(ticker, exchange) {
    const rawName = ticker.Name;
    // Without a name the pattern would be /\bundefined\b/ and match stray text.
    if (!rawName) return;
    // "Twitter" / "Facebook" keep their full legal name; every other name has
    // its "Inc" suffix stripped before matching.
    const keepFullName = rawName.indexOf('Twitter') !== -1 || rawName.indexOf('Facebook') !== -1;
    const stripped = keepFullName ? rawName : rawName.replace(/,? ?Inc\.?/gi, '');
    // Always escape: previously the Twitter/Facebook names went into the
    // RegExp unescaped.
    const name = escapeRegExp(stripped);
    const regex = new RegExp(`\\b${name}\\b`, 'gi');
    if (text.match(regex)) {
      console.log(name);
      console.log(regex.toString());
      tickers.push({ name: ticker.Name, symbol: ticker.Symbol, exchange });
    }
  }
  amex.forEach((ticker) => {
    findTicker(ticker, 'amex');
  });
  nasdaq.forEach((ticker) => {
    findTicker(ticker, 'nasdaq');
  });
  nyse.forEach((ticker) => {
    findTicker(ticker, 'nyse');
  });
  console.log(tickers);
  return tickers;
}
function cryptos(htm, text) {
if (!text) return {};
const tickers = [];
function findTicker(ticker) {
const name = ticker.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const regex = new RegExp(`\\b${name}\\b`, 'g');
if (text.match(regex)) {
console.log(name);
console.log(regex.toString());
tickers.push({ name: ticker.name, symbol: ticker.symbol });
}
}
cryptotickers.forEach(findTicker);
console.log(tickers);
return tickers;
}
function getMeta(htm) {
const doc = new JSDOM(htm);
const meta = {};
const thumb =
doc.window.document.querySelector('meta[property="og:image"]') ||
doc.window.document.querySelector('meta[name="twitter:image"]');
const title = doc.window.document.title;
meta.title = title;
meta.thumb = thumb && thumb.getAttribute('content');
return meta;
}
/**
 * Parses `htm` with JSDOM, normalises image and link URLs against `link`, and
 * extracts the readable article with Readability.
 *
 * @param htm  HTML source of the page.
 * @param link URL the HTML was fetched from.
 * @returns The result of `Readability#parse()`.
 */
function txt(htm, link) {
  const url = new URL(link);
  const doc = new JSDOM(htm);
  doc.window.document.querySelectorAll('img').forEach((el) => {
    const src = el.src;
    const isAbsolute = src.indexOf('http') === 0 || src.indexOf('//') === 0;
    if (!isAbsolute) {
      el.src = '//' + url.host + src;
    } else if (src.indexOf('http://') === 0) {
      // Make plain-http images protocol-relative. The old code tested the
      // truthiness of indexOf() (0 is falsy) and then read the misspelled
      // `el.str`, which turned exactly these URLs into "undefined".
      el.src = src.replace('http:', '');
    } else {
      // https:// and protocol-relative sources stay as they are.
      el.src = src;
    }
  });
  doc.window.document
    .querySelectorAll('a[href]')
    .forEach(
      (el) =>
        (el.href =
          el.href && el.href.indexOf('/') === 0
            ? url.protocol + '//' + url.host + el.href
            : el.href),
    );
  const reader = new Readability(doc.window.document);
  return reader.parse();
}
export default Reader;
After a few days the Docker container has accumulated too many Puppeteer processes, because for some reason the browser doesn't exit properly after fetching URLs.
Eventually the container runs out of resources, and the entire app freezes and becomes inaccessible.
I had the same issue when using Puppeteer inside Docker. The solution was to use dumb-init inside the container. The Dockerfile should then look roughly like this (I assume you are developing a Node project, so we call npm start at the end):
RUN apt-get install dumb-init // ... plus your other packages
... your remaining docker things
ENTRYPOINT ["/usr/bin/dumb-init", "--"]
CMD [ "npm", "start" ]

Node js Scraper

I have written a scraper in typescript, Running on node:10.12.0,
Issue: the code stops doing anything after a few hours, at random, and I have to restart it. My best guess is that it gets stuck on a URL request.
Tools/Packages Using:
Puppeteer
Cheerio
Typescript
Code:
import * as cheerio from "cheerio";
import * as request from "request";
import * as fs from "fs";
import * as shell from "shelljs";
import pup = require("puppeteer");
/**
 * Scraper: walks paginated listing pages with Puppeteer (step1) and stores the
 * title and body of every linked detail page as JSON (step2).
 */
class App {
  // How long a detail-page HTTP request may take before it is aborted. Without
  // a limit a single stalled request blocks the scraper forever.
  private static readonly REQUEST_TIMEOUT_MS = 15000;
  // Upper bound for the initial listing navigation (was 0 = wait forever).
  private static readonly NAVIGATION_TIMEOUT_MS = 60000;

  // Assigned by the caller after `pup.launch()`; `!` tells the compiler so.
  public browser!: pup.Browser;

  /**
   * Appends `content` to `file`.
   *
   * @returns Promise resolving to "DONE"; rejects with the file-system error.
   */
  public appendToFile(file: string, content: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      try {
        fs.appendFileSync(file, content);
        resolve("DONE");
      } catch (e) {
        reject(e);
      }
    });
  }

  /**
   * Downloads `url` and resolves with the response body.
   *
   * @param url       Page to download.
   * @param timeoutMs Abort the request after this many milliseconds.
   * @returns Promise resolving to the HTML; rejects with the request error, or
   *          with the response object when the status is not 200.
   */
  public loadPage(url: string, timeoutMs: number = App.REQUEST_TIMEOUT_MS): Promise<any> {
    return new Promise<any>((resolve, reject) => {
      request.get(url, { timeout: timeoutMs }, (err, res, html) => {
        if (err) {
          reject(err);
        } else if (res.statusCode === 200) {
          resolve(html);
        } else {
          reject(res);
        }
      });
    });
  }

  /**
   * Scrapes one listing URL: collects the detail links of every result page,
   * runs step2 on each, and follows the "next" button until it is hidden.
   *
   * @param url Listing URL; the third path segment from the end names the
   *            output folder under `data/`.
   * @returns Promise resolving to "page all scrapped".
   */
  public async step1(url: string): Promise<string> {
    let page: pup.Page | undefined;
    try {
      const segments = url.split("/");
      const folder = segments[segments.length - 3] || "unknown";
      shell.mkdir("-p", "data/" + folder);

      page = await this.browser.newPage();
      await page.goto(url, {
        timeout: App.NAVIGATION_TIMEOUT_MS
      });

      let count = 1;
      let next = false;
      do {
        next = false;
        const links = await page.evaluate(() => {
          const anchors = document.querySelectorAll(".ch-product-view-list-container.list-view li ul > li > h6 > a");
          const hrefs: string[] = [];
          anchors.forEach((v) => {
            hrefs.push(("https://www.link.com") + (v.getAttribute("href") as string));
          });
          return hrefs;
        });

        // Detail pages are processed one after another; a failure is recorded
        // in error-2.txt and does not stop the listing.
        let c = 1;
        for (const link of links) {
          await this.step2(link, folder, c.toString()).catch((_e) => {
            console.log(_e);
            fs.appendFileSync("./error-2.txt", url + " ### " + link + "\n");
          });
          c++;
        }

        await this.appendToFile("./processed.txt", url + ":" + count.toString() + "\n").catch(e => e);
        count++;
        console.log(url + ":" + count);

        // An empty style attribute means the "next" button is visible. `null`
        // means there is no button at all, which ends pagination instead of
        // throwing inside the page.
        const nextStyle = await page.evaluate(() => {
          const ele = document.querySelector("#pagination-next");
          return ele ? ele.getAttribute("style") || "" : null;
        });
        if (nextStyle === "") {
          next = true;
          await page.click("#pagination-next");
          await page.waitFor(1000);
        }
      } while (next);

      return "page all scrapped";
    } finally {
      if (page !== undefined) {
        await page.close().catch(e => e);
      }
    }
  }

  /**
   * Downloads one detail page and appends its title and body as a JSON line to
   * `./data/<folder>/data<file>.json`.
   *
   * @returns Promise resolving to the result of appendToFile ("DONE").
   */
  public async step2(url: string, folder: string, file: string): Promise<string> {
    // A failed download now rejects right here; before, `reject` was called
    // and execution still continued with undefined HTML.
    const html = await this.loadPage(url);
    const $ = cheerio.load(html);
    const ress: { header?: string; body?: string } = {};
    const t = $(".qal_title_heading").text();
    if (t) {
      ress.header = t.replace(/"/g, "'").replace(/\n|\r|\t/g, "");
    }
    const d = $("div.ch_formatted_text.qal_thread-content_text.asker").html();
    if (d) {
      ress.body = d.replace(/"/g, "'").replace(/\n|\r|\t/g, "");
    }
    const filename = "data" + file + ".json";
    const data = JSON.stringify(ress);
    // This write had been swallowed into a trailing comment, which left
    // `.then` being called on the JSON string and made every call fail.
    return this.appendToFile("./data/" + folder + "/" + filename, data + ",\n");
  }
}
/**
 * Entry point: reads listing URLs (one per line) from the file named by the
 * first CLI argument and scrapes them one after another with a single browser.
 * Finished URLs are appended to ./processed.txt, failed ones to ./error.txt.
 */
async function main() {
  process.on("SIGTERM", () => {
    console.log("SigTerm received");
    process.exit(1);
  });
  process.on("SIGINT", () => {
    console.log("SigInt received");
    process.exit(1);
  });
  shell.mkdir("-p", "data/unknown");

  const c = new App();
  console.log(process.argv[2]);
  // Blank lines (e.g. a trailing newline) are not URLs; skip them.
  const list: string[] = fs
    .readFileSync(process.argv[2], "utf-8")
    .split(/\r?\n/)
    .filter((line: string) => line.trim() !== "");
  console.log("total links->" + list.length);

  c.browser = await pup.launch({
    headless: true
  });
  try {
    for (const l of list) {
      // One entry per line; without "\n" consecutive entries ran together.
      await c.step1(l).then(() => {
        fs.appendFileSync("./processed.txt", l + "\n");
      }).catch(() => {
        fs.appendFileSync("./error.txt", l + "\n");
      });
    }
  } finally {
    // Shut Chromium down so the process can exit once the list is done.
    await c.browser.close().catch(e => e);
  }
}
main();
Let me know if you need something else from me. Also this is all the code.
So , I figured two problems.
The chrome (under puppeteer) consumes high CPU, which gives the trend like this:
at start it's on moderate usage. and it gradually increases. My trend was it started off with 4% usage and after a day, it reached 100%. I've submitted an issue on their git
I did not specify the timeout in request
was:
request.get(url, async (err, res, html) => {
should be:
request.get(url, {timeout: 1500}, async (err, res, html) => {
So far my code is running fine for more than a day now. only issue is high cpu usage. But it's none of my concern as for now.

Resources