Realtime scrape a chat using Node.js

I want to build a scraping application in Node.js that monitors a chat in realtime and stores certain messages in a database.
Specifically, I want to capture data from the chat of streaming platforms and extract useful information that helps the people running the stream.
But I do not know how to start doing this with Node.js.
What I have been able to do so far is capture the message data; however, I cannot monitor new messages in realtime.
Any help in this regard?
What I did so far:
server.js
var express = require('express');
var puppeteer = require('puppeteer');

var app = express();

app.get('/', function(req, res) {
  const url = 'https://www.nimo.tv/live/6035521326';
  (async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto(url);
    await page.waitForSelector('.msg-nickname');
    // collect the nickname of every message currently in the DOM
    const messages = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.msg-nickname'))
        .map(item => item.innerText);
    });
    console.log(messages);
    await browser.close();
  })();
  res.send('Check your console!');
});

app.listen(8081);
console.log('Magic happens on port 8081');

exports = module.exports = app;
With this, I get the nicknames of users' messages and put them in an array. I want my application to keep running and receive new nicknames automatically as they are typed into the chat.
Any help with this challenge?
Maybe I'm going to need to use WebSocket?

If possible, you should use the API that the chat itself is using. Open the network tab inside the Chrome developer tools and try to figure out which network requests are happening.
If that is not possible, you can use a MutationObserver to monitor DOM changes. Expose a function via page.exposeFunction and listen to the relevant changes inside the page. You can then insert the obtained data into a database.
Here is some example code to get you started:
const puppeteer = require('puppeteer');
const { Client } = require('pg');

(async () => {
  const client = new Client(/* ... */);
  await client.connect(); // connect to database

  const browser = await puppeteer.launch({ headless: false });
  const [page] = await browser.pages();

  // called from inside the page whenever a mutation happens
  async function mutationListener(addedText) {
    console.log(`Added text: ${addedText}`);
    // insert data into database
    await client.query('INSERT INTO users(text) VALUES($1)', [addedText]);
  }
  await page.exposeFunction('mutationListener', mutationListener);

  await page.goto('http://...');
  await page.waitForSelector('.msg-nickname');

  await page.evaluate(() => {
    // wait for any mutations inside a specific element (e.g. the chatbox)
    const observerTarget = document.querySelector('ELEMENT-TO-MONITOR');
    const mutationObserver = new MutationObserver((mutationsList) => {
      // handle a change by checking which elements were added and which were removed
      for (const mutation of mutationsList) {
        const { removedNodes, addedNodes } = mutation;
        // example: pass the innerText of the first added element to our mutationListener
        mutationListener(addedNodes[0].innerText);
      }
    });
    mutationObserver.observe( // start the observer
      observerTarget,
      { childList: true }, // wait for child nodes to be added/removed
    );
  });
})();
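If the network tab reveals that the chat arrives over a WebSocket, you may be able to skip the browser entirely and consume the socket directly. Here is a minimal sketch using the ws package; the endpoint URL and message shape are assumptions you would replace with whatever the developer tools show:
const WebSocket = require('ws');

// hypothetical endpoint; copy the real one from the network tab
const ws = new WebSocket('wss://chat.example.com/room/6035521326');

ws.on('open', () => console.log('connected'));
ws.on('message', (raw) => {
  // the payload shape is an assumption; inspect real frames to adapt this
  const msg = JSON.parse(raw);
  if (msg.nickname) {
    console.log(`new message from ${msg.nickname}`);
    // ...insert into your database here
  }
});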

Related

fs.writeFile crashes node app after writing first json file

I'm trying to crawl several web pages to check for broken links and write the results to JSON files. However, after the first file is completed the app crashes with no error popping up...
I'm using Puppeteer to crawl, Bluebird to run each link concurrently, and fs to write the files.
What I've tried:
switching the file type to '.txt' or '.php'; this works, but I need another loop outside the current workflow to convert the files from '.txt' to '.json'. Renaming the file right after writing to it also causes the app to crash.
using try/catch statements around fs.writeFile, but it never throws an error (see the note and sketch after this list)
running the entire app outside of Express; this worked at some point, but I'm trying to use it within the framework
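A note on the try/catch attempt: fs.writeFile is callback-based, so a synchronous try/catch around the call can never catch a write error; the error is only delivered to the callback. A minimal sketch of the promise-based alternative, where try/catch does work (the file name is just an example):
const fs = require('fs').promises;

async function save(fileName, data) {
  try {
    // with await, write errors are thrown here and land in the catch block
    await fs.writeFile(`./tmp/${fileName}.json`, JSON.stringify(data));
  } catch (err) {
    console.error('write failed:', err);
  }
}
The full route is below.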
const express = require('express');
const router = express.Router();
const puppeteer = require('puppeteer');
const bluebird = require("bluebird");
const fs = require('fs');

router.get('/', function(req, res, next) {
  (async () => {
    // Our (multiple) URLs.
    const urls = ['https://www.testing.com/allergy-test/', 'https://www.testing.com/genetic-testing/'];

    const withBrowser = async (fn) => {
      const browser = await puppeteer.launch();
      try {
        return await fn(browser);
      } finally {
        await browser.close();
      }
    };

    const withPage = (browser) => async (fn) => {
      const page = await browser.newPage();
      // Turn the request interceptor on.
      await page.setRequestInterception(true);
      // Ignore all asset requests, just get the document.
      page.on('request', request => {
        if (request.resourceType() === 'document') {
          request.continue();
        } else {
          request.abort();
        }
      });
      try {
        return await fn(page);
      } finally {
        await page.close();
      }
    };

    const results = await withBrowser(async (browser) => {
      return bluebird.map(urls, async (url) => {
        return withPage(browser)(async (page) => {
          await page.goto(url, {
            waitUntil: 'domcontentloaded',
            timeout: 0 // Removes the timeout.
          });
          // Search for the URLs we want to "crawl".
          const hrefs = await page.$$eval('a[href^="https://www.testing.com/"]', as => as.map(a => a.href));
          // Predefine our arrays.
          let links = [];
          let redirect = [];
          // Loop through each URL found on the page.
          for (const href of Object.entries(hrefs)) {
            const response = await page.goto(href[1], {
              waitUntil: 'domcontentloaded',
              timeout: 0 // Removes the timeout.
            });
            const chain = response.request().redirectChain();
            const link = {
              'source_url': href[1],
              'status': response.status(),
              'final_url': response.url(),
              'redirect_count': chain.length,
            };
            // Loop through the redirect chain for each href.
            for (const ch of chain) {
              redirect = {
                status: ch.response().status(),
                url: ch.url(),
              };
            }
            // Push all info about the target link into links.
            links.push(link);
          }
          // JSONify the data.
          const linksJson = JSON.stringify(links);
          let fileName = url.replace('https://www.testing.com/', '');
          fileName = fileName.replace(/[^a-zA-Z0-9\-]/g, '');
          // Write the data to a file in the /tmp directory.
          fs.writeFile(`./tmp/${fileName}.json`, linksJson, (err) => {
            if (err) {
              return console.log(err);
            }
          });
        });
      }, { concurrency: 4 }); // How many pages to run at a time.
    });
  })();
});

module.exports = router;
UPDATE:
So there was nothing wrong with my code... I realized nodemon was stopping the process after each file was saved. Since nodemon detects a "file change", it kept restarting my server after the first item.
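For anyone hitting the same thing: nodemon can be told to ignore the output directory so that writing result files does not trigger a restart. A minimal nodemon.json sketch, assuming the files are written to ./tmp as above:
{
  "ignore": ["tmp/*"]
}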

reuse browser instance puppeteer

I would like to know if it is possible to have one .js file that opens a browser instance, creates a new page/tab, logs in to a website (with username/password) and just stays idle. And then, in a second .js file, use the first file's browser instance and its page.
1.js
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox'],
    ignoreDefaultArgs: ["--hide-scrollbars"]
  });
  const page = await browser.newPage();
  const response = await page.goto('https://google.com');
  console.log('Browser open in the background (headless)!');
  //await browser.close();
})();
2.js
const puppeteer = require('puppeteer');

(async () => {
  // instructions for the browser instance/page from 1.js ...
})();
The crawler object keeps the state of the browser instance, and wherever you call/pass that instance, it refers to the same Chromium in the "background". If this is overkill, and you just want to connect to an already running Chromium using Puppeteer, you can do it with puppeteer.connect. Take a look at this:
How to "hook in" puppeteer into a running Chrome instance/tab – mbit
Yeah, I guess it's overkill for me :). But the link you posted was what I wanted, though I have 2 questions.
This is a sample of what I have.
// 1.js
// open chromium, new tab, go to google.com, print browserWSEndpoint, disconnect
const puppeteer = require('puppeteer');

(async () => {
  var browser = await puppeteer.launch({headless: false});
  var page = await browser.newPage();
  var response = await page.goto('https://google.com');
  var browserWSEndpoint = browser.wsEndpoint();
  console.log(browserWSEndpoint); // prints: ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e
  browser.disconnect();
})();
And
// 2.js
// connect to the open browser with the browserWSEndpoint manually put in, ..., disconnect.
const puppeteer = require('puppeteer');

(async () => {
  var browser = await puppeteer.connect({browserWSEndpoint: 'ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e'});
  // somehow use the tab that is open from 1.js (google.com)
  await browser.disconnect();
})();
I get the browserWSEndpoint string from the console.log in 1.js.
It works great, but I have two difficulties (see the sketch after this list):
1 - How can I use the variable browserWSEndpoint from 1.js so I don't have to copy-paste it into 2.js every time?
2 - If I open a new page/tab in 1.js, go for example to google, and disconnect (browser.disconnect()), how can I use that page/tab in 2.js?
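A minimal sketch of one way to handle both, assuming you are happy to pass the endpoint through a temp file (the file name is arbitrary): 1.js saves the endpoint to disk, and 2.js reads it, reconnects, and picks up the already-open tab via browser.pages():
// 1.js — launch, persist the endpoint, leave Chromium running
const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto('https://google.com');
  fs.writeFileSync('./wsEndpoint.txt', browser.wsEndpoint());
  browser.disconnect(); // the browser itself keeps running
})();

// 2.js — reconnect and reuse the tab opened by 1.js
const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
  const browserWSEndpoint = fs.readFileSync('./wsEndpoint.txt', 'utf8');
  const browser = await puppeteer.connect({browserWSEndpoint});
  const pages = await browser.pages(); // all open tabs, including the one from 1.js
  const page = pages.find(p => p.url().includes('google.com'));
  console.log(page.url());
  browser.disconnect();
})();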
Working, tested code:
getEmail.js is where the actual page is exported. Ask for clarifications in the comments.
getBrowser.js
const puppeteer = require("puppeteer");

module.exports = {
  browser: {},
  pptr_instance_url: "",
  getBrow: async function () {
    try {
      console.log("line6", this.pptr_instance_url);
      this.browser = await puppeteer.connect({ browserWSEndpoint: this.pptr_instance_url }).catch(async e => {
        console.log("end point", this.pptr_instance_url);
        this.browser = await puppeteer.launch({ timeout: 0 });
        this.pptr_instance_url = this.browser.wsEndpoint();
        console.log("line 11", this.pptr_instance_url);
        return this.browser;
      });
      return this.browser;
    } catch (e) {
      console.log(e);
    }
  }
};
pageRenderer.js
const abc = require("../getBrowsernew"); // the getBrowser.js module above

// constants, uuidv4 and setLocalStorage are assumed to be imported elsewhere
const pageRenderer = async (request) => {
  const { reactProjectUrl } = constants,
    uuidStorageKey = uuidv4(),
    localStorageObject = { [uuidStorageKey]: request.body };
  const browser = await abc.getBrow();
  let url = "someurl.com";
  await setLocalStorage(browser, url, localStorageObject);
  const page = await browser.newPage();
  // page.goto takes a single options object; the original passed a second one, which is ignored
  const response = await page.goto(url, {
    waitUntil: "networkidle0",
    timeout: 0
  });
  return page;
};

module.exports = pageRenderer;
getEmail.js
const pageRenderer = require("./pageRenderer");

const getEmail = async (request) => {
  const page = await pageRenderer(request);
  const emailbody = await page.content();
  await page.close();
  return emailbody;
};

module.exports = getEmail;
You can implement this in many ways, like having separate modules with functions or different classes; it depends on your particular need.
For example, you can have a class that launches the browser and creates pages, plus some extra functionality.
//1.js
const puppeteer = require('puppeteer');

class Crawler {
  constructor() {
    //init with whatever values you'll need in your class
    //or throw an error if the object wasn't created through build
  }

  static async build() {
    let crawler = new Crawler();
    await crawler._init();
    return crawler;
  }

  async _init() {
    //launch the browser and keep its state
    this._browser = await puppeteer.launch({timeout: 0});
    //create a page and keep its state
    this._page = await this._browser.newPage();
  }

  //getter
  get browser() {
    return this._browser;
  }

  //getter
  get page() {
    return this._page;
  }

  async login(url) {
    await this._page.goto(url);
    //do whatever is related to the login process
  }
}

module.exports = {Crawler};
Note that we can't have async functions in the constructor. Since launching the browser is async, we use something like a build function to initialize the browser when creating the object. Then we create the crawler object like this:
//2.js
const {Crawler} = require('./1.js');

(async () => {
  let crawler = await Crawler.build();
  await crawler.login("https://example.com");
  //access the crawler's page
  console.log(crawler.page.url());
})();
Keep in mind that this is only an example and by no means representative of best practices. So first you need to understand what you want to achieve with such encapsulation, then adopt the method that suits you best.
Read more on JS classes here

Can not upload files when running tests in docker Selenium_Hub, Selenium-Node-Chrome-Debug and a container w/ Mocha + Selenium-Webdriver (JS)

I have spent a lot of time experimenting and googling to find a solution to this issue, but I have not had any success, and I am hoping that someone will be able to provide some guidance. Here is my situation: I am trying to run JavaScript Mocha selenium-webdriver tests against my company's application using Docker containers. I seem to have everything working except that I am unable to upload files. Before moving my tests into a Docker environment, I was able to run them on our local servers and upload files with no issue, using a method like this:
const companyImage = process.cwd() + '/img/backgroundmario.jpg';
const companyImageElem = await driver.findElement(By.xpath("//div/input[@type='file']"));
await companyImageElem.sendKeys(companyImage);
However, I have not had any success in the Docker environment. I mounted my img folder into my selenium/node-chrome-debug container, which includes a VNC viewer, and I can see that the images are present (I can even upload them manually via the VNC viewer). However, despite numerous variations of the paths to the images, I cannot get them to upload. For some reason the working directory seems to be that of my test container, not my node-chrome-debug container, but even if I add the images to the test container and point the path at that directory, they do not upload either.
Here is a snippet of the code I am using for my test (it includes some things I wouldn't normally include, specifically the checks of process.cwd() and process.env.PWD, since I just wanted to see what the path was):
const {
  Builder,
  By,
  Key,
  until,
  webdriver,
  action
} = require('selenium-webdriver');
const mocha = require('mocha');
const chai = require("chai");
const chaiAsPromised = require("chai-as-promised");
const {
  makeUtilityBelt
} = require('./util');
chai.use(chaiAsPromised);
const fs = require('fs');
const expect = require('chai').expect;
const ciPassword = require('./envData').ciPassword;
const campManagerMail = 'jdrzymala+companycreator@influential.co';
const campManagerName = 'companycreator';
const campManagerUsername = 'companycreator';
const legacy = "http://node-web-client";
const companyImage = '/opt/test/images/backgroundmario.jpg';
var currentDir = process.cwd();
var appFolder = process.env.PWD;
const {
  createLegacyAdmin,
  createLegacyResellerCompany,
  createLegacyBrandCompany,
  createLegacyAgencyCompany,
  createLegacyCampManager,
  createLegacyClient,
  createLegacyInfluencer
} = require('./legacyCreationQueries');
const {
  getEmailId,
  getUserEmailId,
  getRandom,
  verifyRandom,
  accountSetup
} = require('./sqlutil');

describe('Creates a Company of Each Type via the Legacy Dashboard', function () {
  this.timeout(1200000);
  this.slow(20000);
  let driver;
  let util;

  before(async function () {
    driver = new Builder().forBrowser('chrome').usingServer('http://selenium_hub:4444/wd/hub').build();
    util = makeUtilityBelt(driver);
    await createLegacyCampManager(campManagerName, campManagerUsername, campManagerMail);
  });

  afterEach(function () {
    let testCaseName = this.currentTest.title;
    let testCaseStatus = this.currentTest.state;
    if (testCaseStatus === 'failed') {
      driver.takeScreenshot().then((data) => {
        let screenshotPath = `./results/${testCaseName}.png`;
        console.log(`Saving Screenshot as: ${screenshotPath}`);
        fs.writeFileSync(screenshotPath, data, 'base64');
      });
    }
  });

  after(function () {
    driver.quit();
  });

  describe('Load Legacy Corporate Site and Login to Legacy Dashboard', function () {
    it('Loads into the Legacy Dashboard Successfully', async function () {
      await driver.get(legacy);
      await driver.wait(until.elementLocated(By.xpath("//p[contains(text(),'Sign In')]")), 10000);
      await driver.sleep(3000);
      const emailElem = await driver.findElement(By.xpath("//input[@id='email']"));
      await util.sendKeys(emailElem, campManagerMail);
      const pwElem = await driver.findElement(By.xpath("//input[@id='password']"));
      await util.sendKeys(pwElem, ciPassword);
      await driver.findElement(By.xpath("//button[contains(text(),'Sign In')]")).click();
      await driver.wait(until.elementLocated(By.xpath("//div/ul[contains(@class, 'campaign-search-list')]")), 10000);
      await driver.wait(until.elementLocated(By.xpath("//ul[@class='menu']/li/a/span[contains(text(),'User Management')]")), 10000);
      await driver.sleep(5000);
      console.log("Below is the Current Working Directory");
      console.log(currentDir);
      await driver.sleep(3000);
      console.log(appFolder);
      await driver.sleep(3000);
      console.log("The above is the app folder");
      await driver.sleep(2000);
      const loginSuccessElem = await driver.findElement(By.xpath("//ul[@class='menu']/li/a/span[contains(text(),'User Management')]"));
      let loginSuccess = await loginSuccessElem.isDisplayed();
      await driver.sleep(3000);
      await expect(loginSuccess, 'Legacy Login Failed').to.be.true;
    });
  });

  describe('Creates a Reseller Company', function () {
    const companyName = 'Reseller Test Company';

    it('Navigates to Company Management and Begins Company Creation Process', async function () {
      await driver.wait(until.elementLocated(By.xpath("//ul[@class='menu']/li/a/span[contains(text(),'Company Management')]")), 10000);
      await driver.findElement(By.xpath("//ul[@class='menu']/li/a/span[contains(text(),'Company Management')]")).click();
      await driver.sleep(8000);
      await driver.wait(until.elementLocated(By.xpath("//h3[contains(text(),'Search Companies')]")), 10000);
      await driver.wait(until.elementLocated(By.xpath("//a[contains(text(),'+ Create Company')]")), 10000);
      await driver.sleep(8000);
      await driver.findElement(By.xpath("//a[contains(text(),'+ Create Company')]")).click();
      await driver.wait(until.elementLocated(By.xpath("//h3[contains(text(),'Create Company')]")), 10000);
      const companyCreationPageElem = await driver.findElement(By.xpath("//h3[contains(text(),'Create Company')]"));
      let companyCreationPage = await companyCreationPageElem.isDisplayed();
      await expect(companyCreationPage, 'Company Creation Page failed to Load').to.be.true;
    });

    it('Fills in the required fields and creates New Reseller Company', async function () {
      const companyDescription = 'This is a test description for a random test company blah blah blah';
      const companyAddress = '777 Lucky Lane';
      const companyCity = 'Las Vegas';
      const companyState = 'Nevada';
      const companyZip = '89104';
      const companyNameElem = await driver.findElement(By.xpath("//input[@label='Company Name']"));
      await util.sendKeys(companyNameElem, companyName);
      await driver.sleep(1000);
      const companyDescriptionElem = await driver.findElement(By.xpath("//textarea[@label='Company Description']"));
      await util.sendKeys(companyDescriptionElem, companyDescription);
      await driver.sleep(1000);
      const companyTypeElem = await driver.findElement(By.xpath("//select"));
      await companyTypeElem.click();
      await driver.wait(until.elementLocated(By.xpath("//select/option")), 10000);
      const companyTypeSelectElem = await driver.findElement(By.xpath("//select/option[@value='1']"));
      await companyTypeSelectElem.click();
      await driver.sleep(1000);
      const addressElem = await driver.findElement(By.xpath("//input[@label='Address']"));
      await util.sendKeys(addressElem, companyAddress);
      await driver.sleep(1000);
      const cityElem = await driver.findElement(By.xpath("//input[@label='City']"));
      await util.sendKeys(cityElem, companyCity);
      await driver.sleep(1000);
      const stateElem = await driver.findElement(By.xpath("//input[@label='State']"));
      await util.sendKeys(stateElem, companyState);
      await driver.sleep(1000);
      const zipElem = await driver.findElement(By.xpath("//input[@label='Zip Code']"));
      await util.sendKeys(zipElem, companyZip);
      await driver.sleep(1000);
      await driver.findElement(By.xpath("//input[@type='file']")).sendKeys(companyImage);
      await driver.sleep(1000);
      await driver.wait(until.elementLocated(By.xpath("//img[@class='image-preview']")), 10000);
      await driver.sleep(1000);
      const submitButtonElem = await driver.findElement(By.xpath("//button[contains(text(),'Submit')]"));
      await submitButtonElem.click();
      await driver.wait(until.elementLocated(By.xpath("//h3[contains(text(),'Company Actions')]")), 10000);
      await driver.wait(until.elementLocated(By.xpath("//p[@class='company-name']")), 10000);
      const companySuccessElem = await driver.findElement(By.xpath("//p[@class='company-name'][contains(text(),'" + companyName + "')]"));
      let companySuccess = await companySuccessElem.isDisplayed();
      await expect(companySuccess, 'Failed to Create New Company').to.be.true;
    });
  });
});
This is the last thing stopping me from integrating my large number of test files into our CI/CD process, and since a huge number of my tests involve uploading files, it is a major issue. I am extremely thankful for any guidance anyone can provide. Thank you in advance!
Despite not receiving any guidance here, with a bit of additional research, the assistance of a coworker, and some experimentation, I was able to solve my problem.
There are several important pieces. First, you must make sure that the images you want to upload are mounted into the container that runs the browser (in my case, selenium/node-chrome-debug). Then you must make some additions to your Selenium test file.
You must add the following lines:
var path = require('path');
var remote = require('selenium-webdriver/remote');
You can use var or let although I've been told that let is a better standard practice.
Then, after the line
driver = new Builder().forBrowser('chrome').usingServer('http://selenium_hub:4444/wd/hub').build();
Add this line of code
driver.setFileDetector(new remote.FileDetector());
For the file you wish to upload, you must set the path to match the file system of the browser container (selenium/node-chrome-debug in my case). So your file variable would be something like:
const companyImage = process.cwd()+'/images/backgroundmario.jpg';
Then, when you want to upload the file, find the respective element using whichever form of identification you like, and add a little extra to the sendKeys call compared to how you would do it when running the script against your local file system rather than a Docker container. So the code would look like this:
await driver.findElement(By.xpath("//input[@type='file']")).sendKeys(path.resolve(__dirname, companyImage));
Maybe there is a slightly cleaner way to code it (for example, I generally declare the elements I interact with as variables), but the example I have provided will work. It took me a lot of time and effort to find this solution, so I hope it saves someone else the amount of pain I experienced getting this to work.
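Putting the pieces together, a minimal sketch of the full setup (the hub URL and image path come from the snippets above; the page URL is a placeholder to adapt to your environment):
const path = require('path');
const remote = require('selenium-webdriver/remote');
const { Builder, By } = require('selenium-webdriver');

(async () => {
  const driver = new Builder()
    .forBrowser('chrome')
    .usingServer('http://selenium_hub:4444/wd/hub')
    .build();
  // lets sendKeys transparently ship local files to the remote browser node
  driver.setFileDetector(new remote.FileDetector());

  await driver.get('http://node-web-client'); // hypothetical page with an upload form
  const companyImage = path.resolve(__dirname, 'images/backgroundmario.jpg');
  await driver.findElement(By.xpath("//input[@type='file']")).sendKeys(companyImage);
  await driver.quit();
})();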

Can I force SSR for a Nuxt page?

In a Nuxt app I need to render a page with a lot of data displayed on a Google map, obtained from a 100MB .jsonl file. I'm using fs.createReadStream inside asyncData() to parse the data and feed it to the Vue component. Since fs is a server-side-only module, my app errors when it attempts to render that page client-side.
I would like this specific page to be rendered exclusively with SSR so I can use fs in the Vue component.
I thought of using a custom Express middleware to process the data, but that still results in downloading dozens of MB to the client, which is unacceptable. You can see how I request it with Axios in my example.
async asyncData({ $axios }) {
  const fs = require('fs');
  if (process.server) {
    console.log("Server");
    async function readData() {
      const DelimiterStream = require('delimiter-stream');
      const StringDecoder = require('string_decoder').StringDecoder;
      const decoder = new StringDecoder('utf8');
      let linestream = new DelimiterStream();
      let input = fs.createReadStream('/Users/Chibi/WebstormProjects/OPI/OPIExamen/static/stream.jsonl');
      return new Promise((resolve, reject) => {
        console.log("LETS GO");
        let data = [];
        linestream.on('data', (chunk) => {
          let parsed = JSON.parse(chunk);
          if (parsed.coordinates)
            data.push({
              coordinates: parsed.coordinates.coordinates,
              country: parsed.place && parsed.place.country_code
            });
        });
        linestream.on('end', () => {
          return resolve(data);
        });
        input.pipe(linestream);
      });
    }
    const items = await readData();
    return { items };
  } else {
    console.log("CLIENT");
    const items = await this.$axios.$get('http://localhost:3000/api/stream');
    return { items };
  }
}
Even when it renders correctly, Nuxt shows me an error overlay complaining about the issue.
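For reference, a minimal sketch of the middleware idea mentioned in the question, written as a Nuxt serverMiddleware: the 100MB file is parsed entirely on the server and only the reduced coordinate list is sent to the client. Whether the resulting payload is small enough depends on how much the data can be reduced; the route name and file path are assumptions:
// server-middleware/stream.js (registered under serverMiddleware in nuxt.config.js)
const fs = require('fs');
const readline = require('readline');

module.exports = async function (req, res) {
  const rl = readline.createInterface({
    input: fs.createReadStream('./static/stream.jsonl')
  });
  const data = [];
  // read the .jsonl file line by line, keeping only the fields we need
  for await (const line of rl) {
    const parsed = JSON.parse(line);
    if (parsed.coordinates) {
      data.push({
        coordinates: parsed.coordinates.coordinates,
        country: parsed.place && parsed.place.country_code
      });
    }
  }
  res.setHeader('Content-Type', 'application/json');
  res.end(JSON.stringify(data));
};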

nodejs async/await nested API progress

I have an API that searches for a user-provided term, returns an array of results, then fires off async requests for each of those results and collects the results of this second batch of requests. I'd like the API to report progress as it happens rather than just the final result. So, if I make the following request, I should get updates like so:
$ curl 'http://server/?q=foobar'
searching for ${q}…
found 76… now getting images…
found 30 images… done
{
result
}
Most of the relevant code is shown below. FWIW, I am using hapijs for my application.
let imagesOfRecords = {};

const getImages = async function (q) {
  console.log(`searching for ${q}…`);
  const uri = `http://remoteserver/?q=${q}`;
  const {res, payload} = await Wreck.get(uri);
  const result = JSON.parse(payload.toString()).hits;
  const numOfFoundRecords = result.total;
  if (result.total) {
    console.log(`found ${result.total}… now getting images…`);
    const foundRecords = result.hits.map(getBuckets);
    Promise.all(foundRecords).then(function() {
      console.log(`found ${Object.keys(imagesOfRecords).length} images… done`);
      reply(imagesOfRecords).headers = res.headers;
    }).catch(error => {
      console.log(error);
    });
  }
  else {
    console.log('nothing found');
    reply(0).headers = res.headers;
  }
};

const getBuckets = async function(record) {
  const { res, payload } = await Wreck.get(record.links.self);
  const bucket = JSON.parse(payload.toString()).links.bucket;
  await getImageFiles(bucket, record.links.self);
};

const getImageFiles = async function(uri, record) {
  const { res, payload } = await Wreck.get(uri);
  const contents = JSON.parse(payload.toString()).contents;
  imagesOfRecords[record] = contents.map(function(el) {
    return el.links.self;
  });
};
Once I can implement this, my next task would be to implement this progressive update in a web app that uses the above API.
To report results at each step of your requests on the backend, you can use an EventEmitter, which emits an event at each progress step. You can read about events here.
A simple implementation:
const events = require('events');
const eventEmitter = new events.EventEmitter();

//your request code
Promise.all(foundRecords).then(function() {
  console.log(`found ${Object.keys(imagesOfRecords).length} images… done`);
  eventEmitter.emit('progress');
  reply(imagesOfRecords).headers = res.headers;
});

const eventReaction = (e) => {
  // do something with the event, console.log for example
};
eventEmitter.on('progress', eventReaction);
You can find more examples here and here.
To push events to the client you can use the socket.io library; its documentation gives a pretty straightforward explanation of how it works.
If you want to send events between servers or processes and go a little further, you can read more about 0MQ (ZeroMQ) and its Node implementation.
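For the exact curl behavior shown in the question, note that plain HTTP can also stream progress: write to the response as each step completes (chunked transfer encoding) and finish with the JSON result. A minimal sketch using Node's built-in http module rather than hapi; findRecords and getAllImages are hypothetical stand-ins for the Wreck calls above:
const http = require('http');

http.createServer(async (req, res) => {
  res.writeHead(200, { 'Content-Type': 'text/plain' });
  res.write('searching for foobar…\n');        // flushed to curl immediately
  const records = await findRecords();          // hypothetical first request
  res.write(`found ${records.length}… now getting images…\n`);
  const images = await getAllImages(records);   // hypothetical second batch
  res.write(`found ${images.length} images… done\n`);
  res.end(JSON.stringify({ result: images })); // final JSON payload
}).listen(8080);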
