I am working on a PubMed project where I need to extract the IDs of free full text and free PMC articles. This is my code:
import requests
from bs4 import BeautifulSoup
from Bio import Entrez

Entrez.email = "abc#gmail.com"  # Always tell NCBI who you are

handle = Entrez.esearch(db="pubmed", term="cough")
record = Entrez.read(handle)
count = record['Count']
handle = Entrez.esearch(db="pubmed", term="cough", retmax=count)
record = Entrez.read(handle)

free_article_ids = []
for id_ in record['IdList']:
    req = requests.get(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
    soup = BeautifulSoup(req.text, 'lxml')
    status = soup.find('span', {'class': 'status_icon'})
    if status is None:
        continue
    elif status.text in ["Free full text", "Free PMC Article"]:
        free_article_ids.append(id_)
print(free_article_ids)
The problem with my code is that it takes far too long to produce the result, and I want to speed the process up. How do I do that?
Use multithreading to download concurrently. I recommend a simple framework:
from Bio import Entrez
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain

class MySpider(Spider):
    name = 'ncbi.nlm.nih.gov'
    start_urls = []

    def __init__(self):
        Entrez.email = "abc#gmail.com"  # Always tell NCBI who you are
        handle = Entrez.esearch(db="pubmed", term="cough")
        record = Entrez.read(handle)
        count = record['Count']
        handle = Entrez.esearch(db="pubmed", term="cough", retmax=count)
        record = Entrez.read(handle)
        for id_ in record['IdList']:
            self.start_urls.append(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
        Spider.__init__(self, self.name)  # necessary

    free_article_ids = []

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        status = doc.select('span.status_icon')
        if status and status.text in ["Free full text", "Free PMC Article"]:
            id = url.split('/')[-1]
            self.free_article_ids.append(id)
            return {"Urls": [], "Data": {"id": id}}
        return True

SimplifiedMain.startThread(MySpider())
Here are more examples. https://github.com/yiyedata/simplified-scrapy-demo
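If you would rather stay with the standard library, the same multithreading idea can be sketched with concurrent.futures instead. This is a minimal sketch of the original requests/BeautifulSoup loop run across a thread pool; the worker name and pool size are illustrative, not part of the original code, and it assumes record['IdList'] was already fetched with Entrez.esearch as above.

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def is_free(id_):
    # Same per-article check as the original loop; returns the id or None.
    req = requests.get(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
    soup = BeautifulSoup(req.text, 'lxml')
    status = soup.find('span', {'class': 'status_icon'})
    if status is not None and status.text in ["Free full text", "Free PMC Article"]:
        return id_
    return None

# Keep the pool modest: NCBI rate-limits aggressive clients.
with ThreadPoolExecutor(max_workers=10) as executor:
    results = executor.map(is_free, record['IdList'])

free_article_ids = [id_ for id_ in results if id_ is not None]
print(free_article_ids)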
Related
So I realized after printing my data to JSON that my API call isn't finishing. How can I go about creating a countdown so the call ends? I am using the cursor value, which doesn't really change, and I cannot use the rel links. If I'm able to get totalPages, can I count down based off that variable, or is there a better way to end the Python API requests?
My current method doesn't really work, since there is always a cursor, so the while loop never ends.
I thought I could do count = totalpage - 1, but that isn't working either. This is what I have so far:
import requests, sys
import urllib3
from pathlib import Path
import json
from datetime import datetime
from requests import Session
from typing import Any, List, Tuple, Dict
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
from collections import Counter

# disable urllib3 warnings for SSL
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

sys.path.append('/Users/s153152/Documents/Workspace/kpi/rapid7')
def _headers():
    headers = {
        'X-Api-Key': 'key',
        'Content-type': 'application/json',
        'Accept': '*/*'
    }
    return headers
def _request():
    third_party_patching_filer = {
        "asset": "asset.tags IN ['osswin'] && asset.os.vendor CONTAINS 'microsoft' && asset.agentKey IS NOT NULL",
        "vulnerability": "vulnerability.categories NOT IN ['microsoft patch']"}
    headers = _headers()
    print(headers)
    url1 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets?&size=50"
    resp = requests.post(url=url1, headers=headers, json=third_party_patching_filer, verify=False).json()
    has_next_cursor = True
    nextKey = ""
    totalpage = resp["metadata"]
    print(totalpage)
    total_server_ip_addresses = []
    osname_server1 = []
    total_critical_vul_osswin = []
    results = []
    with requests.Session() as session:
        url2 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets?&size=50&cursor={nextKey}"
        while has_next_cursor:
            s = requests.Session()
            backoff_factor = 0.3
            retries = Retry(total=10, backoff_factor=backoff_factor, status_forcelist=[502, 503, 504])
            s.mount(url2, HTTPAdapter(max_retries=retries))
            s = s.post(url=url2, headers=headers, json=third_party_patching_filer, verify=False)
            print(s.status_code)
            s = s.json()
            rel_last = s["metadata"]
            cursor = s["metadata"]
            if "cursor" in cursor:
                nextKey = cursor["cursor"]
                totalResources = cursor["totalResources"]
                count = totalpage - 1
                print(f"next key {nextKey}, total resources {totalResources}")
                print(f"{count}")
                # print(desktop_support)
                for data in s["data"]:
                    for tags in data['tags']:
                        if tags["name"] == 'OSSWIN':
                            try:
                                osname_server = data['host_name']
                                server_host_ip_address = data['ip']
                                critical_vuln_osswin = data['critical_vulnerabilities']
                                os_type = data['os_type']
                                data = {
                                    'ip_address': server_host_ip_address,
                                    'os_name': osname_server,
                                    'critical_vuln': critical_vuln_osswin,
                                    'os_type': os_type
                                }
                                results.append(data)
                            except Exception as e:
                                print(f"Possibly no ip address, error {e}")
                # print(f"Server asset: {osname_server1}, {total_server_ip_addresses}")
                with open('server_info.json', 'w') as f:
                    json.dump(results, f, indent=2)
                # print(data['host_name'])
            else:
                has_next_cursor = False
    return total_server_ip_addresses, sum(total_server_ip_addresses)

def smart_filter():
    test = _request()
    print(test)

smart_filter()
How do I count down from a variable to finish the API call?
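One way to think about ending a cursor-paginated loop is a minimal sketch like the one below: stop when the response no longer returns a cursor, or when the number of collected items reaches the totalResources reported in the metadata. The response shape (metadata containing "cursor" and "totalResources") is taken from the question's own code, not from the Rapid7 documentation, and fetch_page is a hypothetical helper that POSTs the filter with the given cursor and returns parsed JSON.

def collect_all(fetch_page):
    # Sketch: paginate until no cursor is returned or everything metadata
    # says exists has been collected.
    results = []
    cursor = ""
    while True:
        page = fetch_page(cursor)              # one API call per page
        results.extend(page.get("data", []))
        meta = page.get("metadata", {})
        total = meta.get("totalResources", 0)
        cursor = meta.get("cursor")
        # The "countdown" is implicit: remaining = total - len(results)
        if not cursor or len(results) >= total:
            break
    return results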
In this code I want to extract content from newspaper links using BeautifulSoup, but it is not working properly. Each link in the list "filtered_Final_LIST" leads to a page with multiple articles. The function 'ext_url' does not return results for all the pages when I use the concurrent library, whereas a normal for loop works properly. I used the concurrent library to increase extraction speed. Am I doing something wrong?
import concurrent.futures
import time
import re
import urllib.request
from bs4 import BeautifulSoup

MAX_THREADS = 30

filtered_Final_LIST = [
    'https://www.financialexpress.com/economy/finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/',
    'https://www.financialexpress.com/economy/uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/',
    'https://www.financialexpress.com/economy/economic-recovery-yet-to-attain-durability-says-report/2410690/',
    'https://www.financialexpress.com/economy/vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/'
]

def ext_url(url):
    global List_articles, List_header, List_date, List_month, List_year, List_source
    ## Lists to get dates and news articles
    List_articles = []
    List_header = []
    List_date = []
    List_month = []
    List_year = []
    List_source = []
    # for i in range(len(filtered_Final_LIST)):
    #     if 'https://www.financialexpress.com/economy/' in str(Final_LIST[i]):
    # opening the url for reading
    html = urllib.request.urlopen(url, timeout=10)
    print(url)
    # parsing the html file
    htmlParse = BeautifulSoup(html, 'html.parser')
    # getting all the paragraphs of articles
    for para in htmlParse.find_all(['div'], class_='entry-content wp-block-post-content'):
        List_articles.append(para.get_text())
    # Getting respective month, date, year the article published
    from datetime import datetime
    date = htmlParse.find(itemprop="article:published_time").get("content")
    match = re.search(r'\d{4}-\d{2}-\d{2}', date)
    dt = datetime.strptime(match.group(), '%Y-%m-%d').date()
    List_month.append(dt.month)
    List_date.append(dt.day)
    List_year.append(dt.year)
    # getting all the headings of articles
    for para in htmlParse.find_all(['h1'], class_='wp-block-post-title'):
        List_header.append(para.get_text())
    # getting all the source of articles
    for para in htmlParse.find_all(['div'], class_='author-link ie_custom_theme_multiple_authors'):
        List_source.append(para.get_text())
    return List_articles, List_header, List_date, List_month, List_year, List_source

with concurrent.futures.ThreadPoolExecutor() as executor:
    for i in range(len(filtered_Final_LIST)):
        executor.submit(ext_url, filtered_Final_LIST[i])
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd

# pip install trio httpx

mainurl = 'https://www.financialexpress.com/economy/'
news = [
    'finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/',
    'uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/',
    'economic-recovery-yet-to-attain-durability-says-report/2410690/',
    'vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/'
]

allin = []

async def get_soup(content):
    return BeautifulSoup(content, 'lxml')

async def worker(receiver):
    async with receiver:
        async for client, new in receiver:
            r = await client.get(mainurl + new)
            soup = await get_soup(r.text)
            prs = [x.text for x in soup.select('.entry-content > p:not(:last-child)')]
            title = soup.select_one('.wp-block-post-title').text
            author = soup.select_one('div.author-link a').text
            publish = soup.select_one('[itemprop="article:published_time"]')['content'].split('T')[0].split('-')
            target = [title, author, *publish, prs]
            allin.append(target)

async def main():
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        async with receiver:
            for _ in range(5):
                nurse.start_soon(worker, receiver.clone())
            async with sender:
                for new in news:
                    await sender.send([client, new])

if __name__ == "__main__":
    trio.run(main)
    df = pd.DataFrame(allin, columns=['Title', 'Author', 'Year', 'Month', 'Day', 'Paragraphs'])
    print(df)
    df.to_csv('data.csv', index=False)
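If you would rather keep the concurrent.futures approach from the question, the usual fix is to stop resetting shared global lists in every thread and instead return per-URL results and collect them with executor.map. A minimal sketch, assuming the question's ext_url, MAX_THREADS and filtered_Final_LIST, with ext_url rewritten to build local lists and return them:

import concurrent.futures

# Collect the tuple returned by ext_url for each URL; results[i] corresponds
# to filtered_Final_LIST[i], so no shared global state is needed.
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    results = list(executor.map(ext_url, filtered_Final_LIST))

for articles, headers, dates, months, years, sources in results:
    print(len(articles), headers[:1])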
I am trying to scrape the rank, name and url of a company from a website. This involves two pages and I have nested functions to get all the information I need. However, when I try to print the details I get an error that the company_url variable is not defined. I thought that calling the company_button_url function within the main function would do the job, but something is wrong. I have tried calling company_button_url() at differing points in the code, but cannot get it to work.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

# Handle 403 - Forbidden Error
url = 'https://www.b.co.uk/the-lists/mid-companies/'
req = Request(url, headers={'User-Agent': 'Mozilla'})
html = urlopen(req).read()
html_page = html.decode('utf-8')
soup = BeautifulSoup(html_page, 'html.parser')  # create soup object

'''Main Function'''
def company_details():
    # find rank
    rank = soup.find('div', class_="company-score-redesign").text
    # find company name
    company_name = soup.find('div', class_="company-name-redesign").text
    # find company website
    ''' Find Button Url...Parse HTML from new Url...Find Company Website '''
    def company_button_url():
        comp = soup.find('div', class_="company-name-redesign-mobile")
        comp_btn = comp.find('a', href=True)
        comp_btn_url = comp_btn['href']
        new_url = comp_btn_url
        # Handle 403 - Forbidden Error
        new_req = Request(new_url, headers={'User-Agent': 'Mozilla'})
        nhtml = urlopen(new_req).read()  # Getting new page
        nhtml_page = nhtml.decode('utf-8')
        nsoup = BeautifulSoup(nhtml_page, 'html.parser')  # create new soup object
        div_company_url = nsoup.find('div', class_="profile-info")
        href_company_url = div_company_url.find('a', href=True)
        company_url = href_company_url['href']
        return company_url
    company_button_url()
    print(rank, company_name, company_url)
    return()

company_details()
Feel very free to pull my coding to pieces - I am very new to this!
Thanks in advance.
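For reference, the error comes from scoping: company_url is assigned only inside company_button_url, so it does not exist in company_details unless the return value is captured. A minimal, self-contained sketch of the principle (example names are illustrative):

def outer():
    def inner():
        value = "https://example.com"   # local to inner()
        return value
    result = inner()        # capture the return value here
    print(result)           # works: result is defined in outer()'s scope

outer()

Applied to the code above, that means replacing the bare company_button_url() call with company_url = company_button_url() before the print.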
I'm looking to scrape a hotel platform website for reviews.
I cannot figure out two things:
1 - Why can't I extract all reviews in one go? Say there are 14 reviews; I retrieve only 7 of them or so. I assume there is a restriction by the server hosting the website?
2 - When I iterate over the object review_list, the child objects retrieved are the same each time, i.e. I retrieve the same review_item instead of iterating through the various objects that are li tags of class review_item (see the second code snippet).
I'm running Python 3.7 and an example url is:
url example
Hope you can shed some light here.
Thanks!
Code Snippet 1:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import re
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# For ignoring SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# url = input('Enter url - ')
url = input("Enter Url - ")
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
html = soup.prettify("utf-8")

hotel_json_details = {}
hotel_json = {}

for line in soup.find_all('script', attrs={"type": "application/ld+json"}):
    details = line.text.strip()
    details = json.loads(details)
    hotel_json_details["name"] = details["name"]
    hotel_json_details["aggregateRating"] = {}
    hotel_json_details["aggregateRating"]["ratingValue"] = details["aggregateRating"]["ratingValue"]
    hotel_json_details["aggregateRating"]["reviewCount"] = details["aggregateRating"]["reviewCount"]
    hotel_json_details["address"] = {}
    hotel_json_details["address"]["Street"] = details["address"]["streetAddress"]
    hotel_json_details["address"]["Locality"] = details["address"]["addressLocality"]
    hotel_json_details["address"]["Region"] = details["address"]["addressRegion"]
    hotel_json_details["address"]["Zip"] = details["address"]["postalCode"]
    hotel_json_details["address"]["Country"] = details["address"]["addressCountry"]

print(hotel_json_details)

div = soup.find_all(['li'], attrs={"class": "review_item"})
print(div)
Code Snippet 2:
hotel_reviews = []
for line in soup.find_all('li', class_='review_item'):
    review = {}
    review["review_metadata"] = {}
    review["review"] = {}
    review["review_metadata"]["review_date"] = soup.find('p', class_='review_item_date').text.strip()
    review["review_metadata"]["review_staydate"] = soup.find('p', class_='review_staydate').text.strip()
    review["review_metadata"]["reviewer_name"] = soup.find('p', class_='reviewer_name').text.strip()
    review["review_metadata"]["reviewer_country"] = soup.find('span', class_='reviewer_country').text.strip()
    review["review_metadata"]["reviewer_score"] = soup.find('span', class_='review-score-badge').text.strip()
    review["review"]["review_pos"] = soup.find('p', class_='review_pos').text.strip()
    review["review"]["review_neg"] = soup.find('p', class_='review_neg').text.strip()
    scoreword = soup.find('span', class_='review_item_header_scoreword')
    if scoreword is not None:
        review["review_metadata"]["review_header"] = scoreword.text.strip()
    else:
        review["review_metadata"]["review_header"] = ""
    hotel_reviews.append(review)
print(hotel_reviews)
When you are iterating over the review items, you need to use line.find() instead of soup.find(). This way, you'll be looking for review fields inside every review container as opposed to searching the whole HTML tree:
for line in soup.find_all('li', class_='review_item'):
    review = {"review_metadata": {}, "review": {}}
    review["review_metadata"]["review_date"] = line.find('p', class_='review_item_date').text.strip()
    # ^ HERE
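Extending the same fix to every field in the question's loop gives a sketch like the one below. It reuses the soup object and the class names from the question's Code Snippet 2; whether every element is present in each review container is an assumption, hence the guard on the score word only, as in the original.

hotel_reviews = []
for line in soup.find_all('li', class_='review_item'):
    # All lookups are scoped to the current review container via line.find().
    review = {"review_metadata": {}, "review": {}}
    review["review_metadata"]["review_date"] = line.find('p', class_='review_item_date').text.strip()
    review["review_metadata"]["review_staydate"] = line.find('p', class_='review_staydate').text.strip()
    review["review_metadata"]["reviewer_name"] = line.find('p', class_='reviewer_name').text.strip()
    review["review_metadata"]["reviewer_country"] = line.find('span', class_='reviewer_country').text.strip()
    review["review_metadata"]["reviewer_score"] = line.find('span', class_='review-score-badge').text.strip()
    review["review"]["review_pos"] = line.find('p', class_='review_pos').text.strip()
    review["review"]["review_neg"] = line.find('p', class_='review_neg').text.strip()
    scoreword = line.find('span', class_='review_item_header_scoreword')
    review["review_metadata"]["review_header"] = scoreword.text.strip() if scoreword else ""
    hotel_reviews.append(review)
print(hotel_reviews)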
I've implemented a news website scraper that uses the Selenium web driver to access dynamic web pages and BeautifulSoup to retrieve the content. While parsing websites, I also write the scraped data to MongoDB storage and download pictures. I want to implement full news search by a given category or by text that appears in the news content. What would you suggest in terms of parallelization or adding async code to speed up performance?
# -*- coding: utf-8 -*-
import os
import json
import requests
from bs4 import BeautifulSoup
from mongo_setup import Database
import gridfs
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import time
import logging
import re
import pymongo
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
DRIVER_BIN = os.path.join(PROJECT_ROOT, "bin/chromedriver")
class Scraper:
    tsn_resource = 'https://tsn.ua/'
    ukrnet_resource = 'https://www.ukr.net/'
    db_name = 'scraper_db'
    category_coll = 'categories'
    articles_coll = 'articles'

    def __init__(self, limit=10):
        self.limit = limit  # max number of articles per category
        self.db = Database(self.db_name).connect_db()
        self.category_coll = self.init_collection(self.category_coll)
        self.articles_coll = self.init_collection(self.articles_coll)
        self.logger = self.init_logger()
        self.driver = webdriver.Chrome(executable_path=DRIVER_BIN)
        self.image_storage = os.path.join(PROJECT_ROOT, "image_storage/")
    def init_logger(self):
        '''
        Initialize log file.
        '''
        logger = logging.getLogger('scraper_app')
        logger.setLevel(logging.INFO)
        # create a file handler
        handler = logging.FileHandler('scraper_logfile.log')
        handler.setLevel(logging.INFO)
        # create a logging format
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        # add the handlers to the logger
        logger.addHandler(handler)
        return logger
    def init_collection(self, name):
        if name in self.db.collection_names():
            self.db[name].drop()
        return self.db[name]

    def insert_one_to_collection(self, data, collection):
        try:
            collection.insert_one(data)
        except pymongo.errors.DuplicateKeyError:
            pass

    def insert_many_to_collection(self, data, collection):
        try:
            collection.insert_many(data)
        except pymongo.errors.DuplicateKeyError:
            pass
    def download_image(self, image_url):
        '''
        download images from news articles
        to local storage
        '''
        if not image_url.startswith(("data:image", "javascript")):
            local_filename = image_url.split('/')[-1].split("?")[0]
            r = requests.get(image_url, stream=True, verify=False)
            with open(self.image_storage + local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    f.write(chunk)

    def upload_image_to_mongo(self, image_url):
        local_filename = image_url.split('/')[-1].split("?")[0]
        response = requests.get(image_url, stream=True)
        fs = gridfs.GridFS(self.db)
        img = response.raw.read()
        fs.put(img, filename=local_filename)
    def get_page_content(self, url):
        try:
            self.driver.get(url)
        except WebDriverException:
            self.driver = webdriver.Chrome(executable_path=DRIVER_BIN)
        page = self.driver.page_source
        return page

    def parse_page_content(self, url, parser_lib):
        page_obj = self.get_page_content(url)
        soup = BeautifulSoup(page_obj, parser_lib)
        return soup
    def tsn_categories(self):
        categories = self.gather_categories(self.tsn_resource, 'ul.c-app-nav-more-list li a')
        return categories

    def ukrnet_categories(self):
        categories = self.gather_categories(self.ukrnet_resource, 'h2.feed__section--title a')
        return categories

    def gather_categories(self, url, selector):
        categories = []
        soup = self.parse_page_content(url, "html.parser")
        all_categories = soup.select(selector)
        for item in all_categories:
            category = {}
            link = str(item.attrs.get('href'))
            if link.startswith('javascript'):
                continue
            if not link.startswith('https:'):
                link = 'https:' + link
            category['link'] = link
            category['name'] = item.get_text().strip()
            categories.append(category)
        self.insert_many_to_collection(categories, self.category_coll)
        return categories
    def search_by_category(self, category_name):
        if isinstance(category_name, bytes):
            category_name = category_name.decode('utf-8')
        category_list = []
        category_list += self.tsn_categories()
        category_list += self.ukrnet_categories()
        category_obj = next(item for item in category_list if item['name'] == category_name)
        link = category_obj['link']
        if 'ukr.net' in link:
            articles = self.get_ukrnet_articles(category_name, link)
        else:
            articles = self.get_tsn_articles(category_name, link)
        return articles
    def get_ukrnet_articles(self, category_name, url):
        '''
        retrieve all articles from ukr.net by given category link
        '''
        count = 0
        result = []
        soup = self.parse_page_content(url, "html.parser")
        all_articles = soup.select('div.im-tl a')
        for item in all_articles:
            if count <= self.limit:
                article = {}
                link = item.attrs.get('href')
                article['link'] = link
                article['category'] = category_name
                article['content'] = item.contents[0].encode('utf-8')
                result.append(article)
                self.insert_one_to_collection(article, self.articles_coll)
            else:
                break
            count += 1
        return result
    def get_tsn_articles(self, category_name, url):
        '''
        retrieve all articles from tsn.ua by given category link
        '''
        count = 0
        result = []
        data = []  # temporary storage
        # first parse through the list of articles
        soup = self.parse_page_content(url, "html.parser")
        all_articles = soup.select('div.c-entry-embed a.c-post-img-wrap')
        for item in all_articles:
            # iterate limit amount of articles
            if count <= self.limit:
                article = {}
                link = item.attrs.get('href')
                img_src = item.find('img').get('src')
                if link.endswith(".html"):
                    article['link'] = link
                    if img_src is not None:
                        article['img_src'] = img_src
                        self.download_image(img_src)
                    article['category'] = category_name
                    data.append(article)
                count += 1
            else:
                break
        # then iterate over each article
        for article in data:
            new_soup = self.parse_page_content(article['link'], "html5lib")
            news_content = new_soup.select('div.e-content p')
            text_content = []  # article content
            for chunk in news_content:
                text_content.append(chunk.get_text().strip(''))
            article_text = ' '.join(text_content)
            news_header = new_soup.select('div.c-post-meta h1')  # article title
            if news_header:
                header_text = "".join(news_header[0].contents)
            article_image = new_soup.find('figure', class_='js-lightgallery')
            if article_image:
                img_src = article_image.find('img').get('src')  # articles image
                self.download_image(img_src)
            news_chunk = {}
            news_chunk['category'] = article['category']
            news_chunk['link'] = article['link']
            news_chunk['title'] = header_text
            # news_chunk['title'] = ''
            news_chunk['content'] = article_text
            news_chunk['images'] = []
            if 'img_src' in article:
                news_chunk['images'].append(article['img_src'])  # caption image
            if article_image:
                news_chunk['images'].append(img_src)  # article image
            result.append(news_chunk)
            self.insert_one_to_collection(news_chunk, self.articles_coll)
        return result
    def search_by_text(self, text):
        category_links = []
        category_links += self.ukrnet_categories()
        category_links += self.tsn_categories()
        result = self.website_search_by_text(text, category_links)
        return result

    def website_search_by_text(self, text_searched, category_links):
        result = []
        if isinstance(text_searched, bytes):
            text_searched = text_searched.decode('utf-8')
        for link in category_links:
            article = {}
            soup = self.parse_page_content(link['link'], "html.parser")
            all_articles = soup.find_all('a', text=re.compile(text_searched))
            for item in all_articles:
                article['link'] = item.attrs.get('href')
                article['category'] = link['name']
                article['content'] = (item.contents[0].strip()).encode('utf-8')
                self.insert_one_to_collection(article, self.articles_coll)
                result.append(article)
        return result
    def collect_ukrnet_articles(self):
        '''
        outdated
        '''
        categories = self.ukrnet_categories()
        for category in categories:
            count = 0
            soup = self.parse_page_content(category['link'], "html.parser")
            all_articles = soup.select('div.im-tl a')
            for item in all_articles:
                # only 10 first articles
                if count < self.limit:
                    article = {}
                    link = item.attrs.get('href')
                    article['link'] = link
                    article['category'] = category['name']
                    article['content'] = item.contents[0].encode('utf-8')
                    self.insert_one_to_collection(article, self.articles_coll)
                else:
                    break
                count += 1

    def run(self):
        self.search_by_category('Economics')
        self.search_by_text('Economics')
        self.driver.quit()

if __name__ == '__main__':
    scraper = Scraper()
    scraper.run()
Scrapy is a solid Python framework that handles async/parallel work for you automatically.
There's also multiprocessing, conveniently packaged in the standard library.
And then there's multithreading, also conveniently available as a package.
With the multithreading library there's a way to call the function you're trying to thread with map() and then pass it the list/variables you're trying to use: map(your_func, your_list).
I don't remember the exact link or structure for it, but it's a quick Google search away. It really makes things easier.
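For illustration, a minimal sketch of the map() idea described above, using multiprocessing.dummy (the thread-backed version of multiprocessing.Pool). The fetch function and URL list are placeholders, not part of the scraper code in the question:

from multiprocessing.dummy import Pool as ThreadPool
import requests

urls = ['https://tsn.ua/', 'https://www.ukr.net/']  # placeholder list

def fetch(url):
    # placeholder worker: download one page and return its size
    return url, len(requests.get(url).text)

with ThreadPool(4) as pool:               # 4 worker threads
    results = pool.map(fetch, urls)       # same shape as map(your_func, your_list)

print(results)

The same pattern could be applied to the per-article fetching in get_tsn_articles, since each article page is independent of the others.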