NodeJS program to scrape table columns from Wikipedia - node.js

I am in the process of learning Node JS and how to web scrape. I thought it would be a good approach to extract columns from a wikipedia page https://en.wikipedia.org/wiki/List_of_S%26P_500_companies
I've been learning about web scraping with Cheerio but am unsure how to code this in NodeJS. I've become familiar with html selectors to identify elements on a page but am not sure how to extract into a program. I plan on extracting this information into a list. I am hoping to extract the Symbol and Security columns on the table on the wiki page.
Below is the code I have compiled and the result I am getting. I have created a const based off a selector on the webpage. I believe it is supposed to return all the values in the column based off the selector.
var AWS = require("aws-sdk");
var AWS = require("aws-sdk/global");
AWS.config.apiVersions = {
dynamodb: '2012-08-10'
};
var dynamodb = new AWS.DynamoDB();
const cheerio = require('cheerio');
const axios = require('axios');
const express = require('express');
async function getStocks() {
try {
const url = ' https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
const { data } = await axios({
method: "GET",
url: url,
})
const $ = cheerio.load(data)
const elemSelector = '#constituents > tbody > tr:nth-child(1) > td'
$(elemSelector).each((parentIdx, parentElem) => {
console.log(parentIdx)
})
console.log($)
} catch (err) {
console.error(err)
}
}
getStocks()
Result
[Function: initialize] {
html: [Function: html],
xml: [Function: xml],
text: [Function: text],
parseHTML: [Function: parseHTML],
root: [Function: root],
contains: [Function: contains],
merge: [Function: merge],
load: [Function: load],
_root: Node {
type: 'root',
name: 'root',
parent: null,
prev: null,
next: null,
children: [ [Node], [Node] ],
'x-mode': 'no-quirks'
},
_options: { xml: false, decodeEntities: true },
fn: Cheerio { constructor: [Function: LoadedCheerio] }
}
[Finished in 2.384s]

This should help. Follow the comments to understand what is happening. You didn't specify how you wanted the data output, so I have them as arrays but you can add code to map the table data with the table headers now that you have the raw data.
// parses HTML
var $ = cheerio.load(data);
// the number of columns you want to target on the table, starting from the left column
var number_of_columns = 2;
// targets the specific table with a selector
var html_table = $('table#constituents');
// gets table header titles; loops through all th data, and pulls an array of the values
var table_header = html_table.find('th').map(function() {return $(this).text().trim();}).toArray();
// removes columns after the number of columns specified
table_header = table_header.slice(0, number_of_columns);
// gets table cell values; loops through all tr rows
var table_data = html_table.find('tbody tr').map(function(tr_index) {
// gets the cells value for the row; loops through each cell and returns an array of values
var cells = $(this).find('td').map(function(td_index) {return $(this).text().trim();}).toArray();
// removes columns after the number of columns specified
cells = cells.slice(0, number_of_columns);
// returns an array of the cell data generated
return [cells];
// the filter removes empty array items
}).toArray().filter(function(item) {return item.length;});
// output the table headers
console.log('table_header', table_header);
// output the table data
console.log('table_data', table_data);

In nodejs you can user puppeteer for your web scraping. it's a very good library with a lot of browser settings and query selectors options!
You can find more about the library reading the documentation at https://pptr.dev/
If you want to integrate it with an API I can help you too.

Related

how to scrape data from a website using node

I am new in Node JS. from morning i am trying to scrape data from website https://www.pmkisan.gov.in/StateDist_Beneficiery.aspx
[![enter image description here][1]][1]
I want to store that date also in db table. Currently i am putting hard coded date. But how can i scrape that date?
my code
(async () => {
await performDbActions(webData);
})();
async function performDbActions(data) {
let dataToBeInsert = {};
// console.log(data);
for (const d of data) {
if (Object.keys(d).length) {
// console.log(d);
let district = await db.sequelize.query("select * from abc where name=:districtName or other_names like :districtLike", {
replacements: {districtName: d['name'], districtLike: '%' + d['name'] + '%'},
raw: true,
type: db.sequelize.QueryTypes.SELECT
});
delete d['sno'];
delete d['name'];
d['as_on'] = '2020-02-06';
}
}
}
}
According to the page's source code the date you're looking for is inside a <span> that has the id ContentPlaceHolder1_lbldate. So you can just use cheerio to get its text-content and pass the result to performDbActions as an additional parameter:
//...
const date = $('#ContentPlaceHolder1_lbldate').text();
//...
await performDbActions(webData, date);
// ...
async function performDbActions(data, date) {
// ...
// it would be safer to use an external date-library like moment.js but here's a way to convert the date in plain js
const dateParts =date.split('/');
const dateObj = new Date(dateParts[2], dateParts[1] - 1, dateParts[0]);
d['created_at'] = dateObj;
}
Note that the date is in format dd/mm/yyyy, so you may have to convert it to your desired format.

Dynamo DB Query Filter Node.js

Running a Node.js serverless backend through AWS.
Main objective: to filter and list all LOCAL jobs (table items) that included the available services and zip codes provided to the filter.
Im passing in multiple zip codes, and multiple available services.
data.radius would be an array of zip codes = to something like this:[ '93901', '93902', '93905', '93906', '93907', '93912', '93933', '93942', '93944', '93950', '95377', '95378', '95385', '95387', '95391' ]
data.availableServices would also be an array = to something like this ['Snow removal', 'Ice Removal', 'Salting', 'Same Day Response']
I am trying to make an API call that returns only items that have a matching zipCode from the array of zip codes provided by data.radius, and the packageSelected has a match of the array data.availableServices provided.
API CALL
import * as dynamoDbLib from "./libs/dynamodb-lib";
import { success, failure } from "./libs/response-lib";
export async function main(event, context) {
const data = JSON.parse(event.body);
const params = {
TableName: "jobs",
FilterExpression: "zipCode = :radius, packageSelected = :availableServices",
ExpressionAttributeValues: {
":radius": data.radius,
":availableServices": data.availableServices
}
};
try {
const result = await dynamoDbLib.call("query", params);
// Return the matching list of items in response body
return success(result.Items);
} catch (e) {
return failure({ status: false });
}
Do I need to map the array of zip codes and available services first for this to work?
Should I be using comparison operators?
https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/LegacyConditionalParameters.QueryFilter.html
Is a sort key value or partition key required to query and filter? (the table has a sort key and partition key but i would like to avoid using them in this call)
Im not 100% sure on how to go about this so if anyone could point me in the right direction that would be wonderful and greatly appreciated!!
I'm not sure what your dynamodb-lib refers to but here's an example of how you can scan for attribute1 in a given set of values and attribute2 in a different set of values. This uses the standard AWS JavaScript SDK, and specifically the high-level document client.
Note that you cannot use an equality (==) test here, you have to use an inclusion (IN) test. And you cannot use query, but must use scan.
const AWS = require('aws-sdk');
let dc = new AWS.DynamoDB.DocumentClient({'region': 'us-east-1'});
const data = {
radius: [ '93901', '93902', '93905', '93906', '93907', '93912', '93933', '93942', '93944', '93950', '95377', '95378', '95385', '95387', '95391' ],
availableServices: ['Snow removal', 'Ice Removal', 'Salting', 'Same Day Response'],
};
// These hold ExpressionAttributeValues
const zipcodes = {};
const services = {};
data.radius.forEach((zipcode, i) => {
zipcodes[`:zipcode${i}`] = zipcode;
})
data.availableServices.forEach((service, i) => {
services[`:services${i}`] = service;
})
// These hold FilterExpression attribute aliases
const zipcodex = Object.keys(zipcodes).toString();
const servicex = Object.keys(services).toString();
const params = {
TableName: "jobs",
FilterExpression: `zipCode IN (${zipcodex}) AND packageSelected IN (${servicex})`,
ExpressionAttributeValues : {...zipcodes, ...services},
};
dc.scan(params, (err, data) => {
if (err) {
console.log('Error', err);
} else {
for (const item of data.Items) {
console.log('item:', item);
}
}
});

How to implement a selector in easy: search for Meteor, using React instead of Blaze

I'm trying to follow the documentation and examples to add a server-side selector to a search function in my Meteor app, implemented using the Easy Search plugin. The end goal is to ensure that only documents the user has permission to see are returned by searching.
I can see a selector working in the Leaderboard example, but I can't get it to work in my code.
Versions:
Meteor 1.7.0.1
easy:search#2.2.1
easysearch:components#2.2.2
easysearch:core#2.2.2
I modified the Meteor 'todos' example app to demonstrate the problem, and my demo code is in a repo.
NOTE! to demonstrate the problem, you need to create an account in the demo app, then create a list and make it private. This add the 'userId' field to the list.
Then you can search for the name of the list, by typing in the search box near the top of the main section; search results are written to the browser console.
The first problem is that if I copy the code from the example in the documentation, I see a server error 'searchObject is not defined:
copied from docs, causes an error: imports/api/lists/lists.js
export const MyIndex = new Index({
'collection': Lists,
'fields': ['name'],
engine: new MongoDBEngine({
selector(searchDefinition, options, aggregation) {
// retrieve the default selector
const selector = this.defaultConfiguration()
.selector(searchObject, options, aggregation)
// options.search.userId contains the userId of the logged in user
selector.userId = options.search.userId
return selector
},
}),
});
It seems there is an error in the docs.
Working instead from the leaderboard example, the code below runs but intermittently returns no results. For example if I have a list called "My list", and I type the search term 's', sometimes the list is returned from the search and sometimes it is not. If I use the MiniMongo engine it all works perfectly.
index selector {"$or":[{"name":{"$regex":".*my.*","$options":"i"}}],"userId":"Wtrr5FRHhkKuAcrLZ"}
client and server: imports/api/lists/lists.js
export const MyIndex = new Index({
'collection': Lists,
'fields': ['name'],
'engine': new MongoDBEngine({
selector: function (searchObject, options, aggregation) {
let selector = this.defaultConfiguration().selector(searchObject, options, aggregation);
selector.userId = options.search.userId;
console.log('index selector', JSON.stringify(selector));
return selector;
}
}),
permission: () => {
return true;
},
});
client: imports/ui/components/lists-show.js
Template.Lists_show.events({
'keyup #search'(event) {
console.log('search for ', event.target.value);
const cursor = MyIndex.search(event.target.value);
console.log('count',cursor.count());
console.log('results', cursor.fetch());
},
});
client: imports/ui/components/lists-show.html
<input id="search" type="text" placeholder="search..." />
Edit: I think the problem is that while the Minimongo engine runs on the client, the MongoDBEngine runs on the server and there are timing issues with the results. The docs show using Tracker.autorun, but that's not a natural fit with my React / Redux app. I'll post an answer if I manage to figure something out - I can't be the only person trying to do something like this.
I got it working in my React / Redux / Meteor app. Things to note:
The cursor MyIndex.search(searchTerm) is a reactive data source - you can't just use it as a return value. When searching on the client with MiniMongo this isn't an issue, but it's important when you use MongoDBEngine to search on the server, because it's asynchronous. In React you can wrap the cursor in withTracker to pass data to the component reactively. In Blaze you would use autorun.tracker. This is shown in the docs but not explained, and it took me a while to understand what was happening.
The docs have an error in the selector example, easily corrected but it's confusing if you have other problems in your code.
With MongoDBEngine, 'permission' must be specified - it does not default to 'true'. Without it, you will see no results.
Writing out the default selector object to the console let me see how it's constructed, and then create a new selector that returns MyDocs that are either public or created by the user.
My code is below. In case it helps anybody else, I've shown how to search on tags also, which are objects with a name property stored in a collection Tags. Each MyDoc has a 'tags' property which is an array of tag ids. The selector first searches the Tags collection to find tags whose name matches the search term, then selects docs in MyDocs with the ids of those tags in their doc.tags array.
There may be a better way to find the search term, or to structure the Tags search, but this is what I could get working.
On server and client:
import { Index, MongoDBEngine } from 'meteor/easy:search';
export const MyDocs = new Mongo.Collection('mydocs');
export const Tags = new Mongo.Collection('tags');
export const MyIndex = new Index({
'collection': MyDocs,
'fields': ['name'],
'engine': new MongoDBEngine({
'selector': function (searchObject, options, aggregation) {
const selector = this.defaultConfiguration().selector(searchObject, options, aggregation);
console.log('default selector', selector); // this searches on name only
// find docs by tag as well as by name
const searchTerm = searchObject.name;
const matchingTags = Tags.find({ 'name': { '$regex': searchTerm } }).fetch();
const matchingTagIds = matchingTags.map((tag) => tag._id);
selector.$or.push({ 'tags': { '$in': matchingTagIds } });
const newSelector = {
'$and': [
{
'$or': [
{ 'isPublic': { '$eq': true } },
{ 'createdBy': options.search.userId },
],
},
{
'$or': selector.$or,
},
],
};
return newSelector;
},
'fields': (searchObject, options) => ({
'_id': 1,
'createdBy': 1,
'name': 1,
}),
'sort': () => ({ 'name': 1 }),
}),
'permission': () => true,
});
React component in client only code:
import React from 'react';
import { connect } from 'react-redux';
import { withTracker } from 'meteor/react-meteor-data';
import PropTypes from 'prop-types';
import store from '../modules/store';
import {
getSearchTerm,
searchStart,
} from '../modules/search'; // contains Redux actions and partial store for search
import { MyIndex } from '../../modules/collection';
function Search(props) {
// functional React component that contains the search box
...
const onChange = (value) => {
clearTimeout(global.searchTimeout);
if (value.length >= 2) {
// user has entered a search term
// of at least 2 characters
// wait until they stop typing
global.searchTimeout = setTimeout(() => {
dispatch(searchStart(value)); // Redux action which sets the searchTerm in Redux state
}, 500);
}
};
...
// the component returns html which calls onChange when the user types in the search input
// and a list which displays the search results, accessed in props.searchResults
}
const Tracker = withTracker(({ dispatch }) => {
// searchTerm is saved in Redux state.
const state = store.getState();
const searchTerm = getSearchTerm(state); // Redux function to get searchTerm out of Redux state
let results = [];
if (searchTerm) {
const cursor = MyIndex.search(searchTerm); // search is a reactive data source
results = cursor.fetch();
console.log('*** cursor count', cursor.count());
return {
'searchResults': results,
};
})(Search);
export default connect()(Tracker);

Global search is not working in DataTable when server side is enabed in NodeJs

I am trying to get form data from MongoDB server and showing it into data table using nodeJs.I successfully have done server-side pagination using npm Paginate v-2 plugin. But now the searching is not working. Below is my NodeJs and javascript files code. Please help me for searching.
NodeJs code
app.get('/gettable',(req,res)=>{
console.log(req.query);
user.paginate({},{
page:Math.ceil(req.query.start / req.query.length) + 1,
limit:parseInt(req.query.length)
},function(err,result){
var mytable = {
draw:req.query.draw,
recordsTotal:0,
recordsFiltered:0,
data:[],
}
if(err) {
console.log(err);
res.json(mytable);
} else {
if(result.totalDocs > 0) {
mytable.recordsTotal = result.totalDocs;
mytable.recordsFiltered = result.totalDocs;
for(var key in result.docs) {
mytable.data.push([
result.docs[key]['name'],
result.docs[key]['lastname'],
result.docs[key]['email'],
result.docs[key]['pass'],
result.docs[key]['birthdate'],
result.docs[key]['zipcode'],
result.docs[key]['phonenumber'],
]);
}
}
res.json(mytable);
}
});
DisplayTable.Js code
$(document).ready(function(){
$('#example').DataTable({
"processing": true,
"serverSide": true,
"ajax": "http://localhost:8080/gettable"
});
})
As I said, I am successfully getting data from a server and showing into data table with server-side pagination but searching is not working but in searching div whatever I search, I am getting that value in search array, like this
search: { value: 'svs', regex: 'false' },
_: '1548653540009' }
But its not implementing in datatable to filter columns.
As I said in the comment that search will not work out of the box when server side is enabled in DataTable, it is because now the whole functionality, whether sorting, paging, limit, and search has to be implemented in the server. DataTable will only send the parameter needed for doing the functionality. Following is the code just for your reference, it is not tested and you may get an error also. You may get inputs from the following code. Feel free to edit the following code if in case of getting errors so that it can help future readers.
app.get('/gettable',(req,res)=>{
console.log(req.query);
var query = {},
// array of columns that you want to show in table
columns = ['name', 'lastname', 'email', 'pass', 'birthdate', 'zipcode', 'phonenumber',];
// check if global search is enabled and it's value is defined
if (typeof req.query.search !== 'undefined' && req.query.search.value != '') {
// get global search value
var text = req.query.search.value;
// iterate over each field definition to check whether search is enabled
// for that particular column or not. You can set search enable/disable
// in datatable initialization.
for (var i=0; i<req.query.columns.length; i++) {
requestColumn = req.query.columns[i];
column = columns[requestColumn.data];
// if search is enabled for that particular field then create query
if (requestColumn.searchable == 'true') {
query[column] = {
$regex: text,
};
}
}
}
user.paginate(query,{
page:Math.ceil(req.query.start / req.query.length) + 1,
limit:parseInt(req.query.length)
},function(err,result){
var mytable = {
draw:req.query.draw,
recordsTotal:0,
recordsFiltered:0,
data:[],
}
if(err) {
console.log(err);
res.json(mytable);
} else {
if(result.totalDocs > 0) {
mytable.recordsTotal = result.totalDocs;
mytable.recordsFiltered = result.totalDocs;
for(var key in result.docs) {
var data = [];
for(var column in columns) {
data.push(result.docs[key][column]);
}
mytable.data.push(data);
}
}
res.json(mytable);
}
});

Excel Javascript API - Check for merge cells

I would like to query Excel for a range of cells and know whether there are merged cells within that range. I see that there is an API for merging cells and an API for unmerging cells. Does anyone know if it is possible to just check for merge cells?
Currently we don't have an API to query the state of merged cells.
Please log a feature request on UserVoice and we will take it into account for future API planning.
-Philip, Developer on the Office Extensibility team
Docs: https://learn.microsoft.com/en-us/javascript/api/excel/excel.range?view=excel-js-preview#getMergedAreasOrNullObject__
Note: This is still a WIP for me, would need a "isNull" check and I haven't adapted this for production code, but this should be a good starting point. I recommend examine the mergedAreas object if you need further details.
Here is how I did it:
async function Get_Merged_Areas_Arr_Of_Objs(context, ws) { //test for Null
var mergedAreas = ws.getUsedRange(true).getMergedAreasOrNullObject();
mergedAreas.load(["areas"]);
await context.sync()
var Merged_Areas_Arr_Of_Objs = []
var arrlen = mergedAreas.areas.items.length
for (var ai = 0; ai < arrlen; ai += 1) {
var obj = mergedAreas.areas.items[ai]
var rng_obj = {}
rng_obj['worksheet'] = obj['address'].split("!")[0]
rng_obj['rowIndex'] = obj['rowIndex']
rng_obj['columnIndex'] = obj['columnIndex']
rng_obj['rowCount'] = obj['rowCount']
rng_obj['columnCount'] = obj['columnCount']
Merged_Areas_Arr_Of_Objs.push(rng_obj)
}
return Merged_Areas_Arr_Of_Objs
}
var Merged_Areas_Arr_Of_Objs = await Get_Merged_Areas_Arr_Of_Objs(context,ws)
console.log('Merged_Areas_Arr_Of_Objs:')
console.log(Merged_Areas_Arr_Of_Objs)
[object Object],[object Object],[object Object],[object Object]
[
0: {
[functions]: ,
__proto__: { },
columnCount: 1,
columnIndex: 0,
rowCount: 3,
rowIndex: 1,
Symbol()_7.z1gk27bjwa1: undefined,
Symbol(nodejs.util.inspect.custom)_j.z1gk27bjwa1: undefined,
worksheet: "Sheet3"
},
1: { },
2: { },
3: { },
length: 4
]

Resources