I want to scrape multiple URLs and parse them as quickly as possible, but the for loop is not fast enough for me. Is there a way to do this, maybe with asynchronous code, multiprocessing, or multithreading?
import grequests
from bs4 import BeautifulSoup

links1 = []  # multiple links

while True:
    try:
        reqs = (grequests.get(link) for link in links1)
        resp = grequests.imap(reqs, size=25, stream=False)
        for r in resp:  # I WANT TO RUN THIS FOR LOOP AS FAST AS POSSIBLE - IS IT POSSIBLE?
            soup = BeautifulSoup(r.text, 'lxml')
            parse = soup.find('div', class_='txt')
    except Exception:
        continue  # retry on failure
Here is an example of how to use multiprocessing with requests/BeautifulSoup:
import requests
from tqdm import tqdm  # for a pretty progress bar
from bs4 import BeautifulSoup
from multiprocessing import Pool

# some 1000 links to analyze
links1 = [
    "https://en.wikipedia.org/wiki/2021_Moroccan_general_election",
    "https://en.wikipedia.org/wiki/Tangerang_prison_fire",
    "https://en.wikipedia.org/wiki/COVID-19_pandemic",
    "https://en.wikipedia.org/wiki/Yolanda_Fern%C3%A1ndez_de_Cofi%C3%B1o",
] * 250

def parse(url):
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    return soup.select_one("h1").get_text(strip=True)

if __name__ == "__main__":
    with Pool() as p:
        out = []
        for r in tqdm(p.imap(parse, links1), total=len(links1)):
            out.append(r)
        print(len(out))
With my internet connection/CPU (Ryzen 3700x) I was able to get results from all 1000 links in 30 seconds:
100%|██████████| 1000/1000 [00:30<00:00, 33.12it/s]
1000
All my CPU cores were fully utilized (checked with htop).
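Since fetching pages is mostly I/O-bound, a thread pool gives a similar speedup without spawning extra processes. A minimal sketch along the same lines (the links here are just placeholders):

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def parse(url):
    # same parsing as in the multiprocessing example above
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    return soup.select_one("h1").get_text(strip=True)

if __name__ == "__main__":
    links1 = ["https://en.wikipedia.org/wiki/COVID-19_pandemic"] * 100  # placeholder links
    # 25 worker threads; tune this to your connection and the target site's limits
    with ThreadPoolExecutor(max_workers=25) as executor:
        results = list(executor.map(parse, links1))
    print(len(results))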
I have a URL from which I am trying to extract data, and I have found a way to extract it. But how do I move on to the next URL once the existing URL has no more data?
The base URL used in the main function is
https://posoco.in/reports/daily-reports/
but instead I only want to extract data starting from 2020-21 and continue from there:
https://posoco.in/reports/daily-reports/daily-reports-2020-21/
This way, once all the PDF files from 2020-21 have been extracted, the program should start extracting from the next URL (2021-22), and so on for as long as the website exists, so the program can check for new reports automatically every year.
The code I have written:
# import libraries
import re
import tabula
import datetime
from datetime import datetime, timedelta
from datetime import timedelta, date
import requests
import pandas as pd
import glob
import logging
import os
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
import wget

def source_urls(url):
    # collect links to the yearly daily-report pages
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    filelink = []
    for link in tags:
        if 'daily-reports-' in link.get('href', ''):
            filelink.append(link.get('href'))
    return filelink

def get_urls(url):
    # collect the PDF download links on a yearly report page
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    filelink = []
    for link in tags:
        if '_nldc_psp/?wpdmdl' in link.get('href', ''):
            filelink.append(link.get('href'))
    return filelink

if __name__ == "__main__":
    url = 'https://posoco.in/reports/daily-reports/'
    file_links = source_urls(url)
    sorted_file_links = sorted(file_links)
    for files in sorted_file_links:
        sub_files = get_urls(files)
        for x in sub_files:
            print(x)
The program output:
https://posoco.in/download/31.03.14_nldc_psp/?wpdmdl=3256
https://posoco.in/download/30.03.14_nldc_psp/?wpdmdl=3255
https://posoco.in/download/29.03.14_nldc_psp/?wpdmdl=3254
https://posoco.in/download/28.03.14_nldc_psp/?wpdmdl=3253
....
...
...
...
...
https://posoco.in/download/11-03-21_nldc_psp/?wpdmdl=35681
https://posoco.in/download/10-03-21_nldc_psp/?wpdmdl=35649
https://posoco.in/download/09-03-21_nldc_psp/?wpdmdl=35627
https://posoco.in/download/08-03-21_nldc_psp/?wpdmdl=35612
https://posoco.in/download/07-03-21_nldc_psp/?wpdmdl=35589
I have pasted all the libraries, but only a few are used here; the rest are used for downloading, processing, and logging.
Well, you have to set a condition so that it only picks up the main URLs that are equal to or later than 2020-21, and then you can parse the inner URLs.
Also, there's no need to use .get with a default value here: since you are already filtering on URLs that include _nldc_psp/?wpdmdl, a match is simply returned, so you don't need to substitute an empty string with link.get('href', '') before returning it.
import httpx
import trio
from bs4 import BeautifulSoup
from datetime import datetime
from pprint import pprint as pp

cond = datetime.strptime('2020-21', '%Y-%d')

async def get_urls(client):
    r = await client.get('https://posoco.in/reports/daily-reports/')
    soup = BeautifulSoup(r.text, 'lxml')
    return [x['href'] for x in soup.select('a[href*=reports-]')
            if datetime.strptime(x['href'].split('-', 3)[-1][:-1], '%Y-%d') >= cond]

async def main():
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        links = await get_urls(client)

        async def get_pdf(url):
            r = await client.get(url)
            soup = BeautifulSoup(r.text, 'lxml')
            pp([x['href'] for x in soup.select('a[href*="nldc_psp"]')])

        if links:
            for link in links:
                nurse.start_soon(get_pdf, link)

if __name__ == "__main__":
    trio.run(main)
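If you prefer to keep the synchronous urllib/requests approach from the question, another option is to filter the yearly pages before looping. A minimal sketch (it reuses the question's source_urls() and get_urls() helpers and the re module it already imports):

import re

def filter_year_pages(file_links, start_year=2020):
    # keep only the yearly pages from 2020-21 onwards;
    # yearly pages look like .../daily-reports-2020-21/
    keep = []
    for link in file_links:
        match = re.search(r'daily-reports-(\d{4})-\d{2}', link)
        if match and int(match.group(1)) >= start_year:
            keep.append(link)
    return keep

if __name__ == "__main__":
    url = 'https://posoco.in/reports/daily-reports/'
    for page in sorted(filter_year_pages(source_urls(url))):
        for pdf_link in get_urls(page):
            print(pdf_link)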
I am scraping data from https://www.bseindia.com/markets/PublicIssues/BSEBidDetails_ofs.aspx?flag=NR&Scripcode=541729. The data changes every 10 seconds, so I have put the script on a timer. At the start of each iteration it re-initializes the pandas DataFrame, scrapes the website, and adds the data to the DataFrame. Is there a way I can visualise this data in a static window?
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd
import time

url = 'https://www.bseindia.com/markets/PublicIssues/BSEBidDetails_ofs.aspx?flag=NR&Scripcode=541729'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
bseobj = pd.DataFrame(columns=['Price Interval', 'qNumber of bids', 'qConfirmed', 'qYet to be confirmed',
                               'qTotal', 'Confirmed', 'Yet to be confirmed', 'Total'])

while True:
    i = 0
    r = []
    lr = []
    webpage = urlopen(req).read()
    page_soup = soup(webpage, "html.parser")
    table = page_soup.find_all("table")
    for row in table[1]:
        for column in row.find_all("td", {"class": "body_text_table"}, {"bgcolor": "#FFFFFF"}):
            val = column.get_text()
            val = val.replace(',', '')
            chk = val.replace('.', '')
            if chk.isdigit():
                r.append(float(val))
            elif val == "":
                r.append(0)
            else:
                r.append(val)
        lr.append(r)
        r = []
    for i in range(2, len(lr) - 2):
        bseobj = bseobj.append({'Price Interval': lr[i][0], 'qNumber of bids': lr[i][1], 'qConfirmed': lr[i][2],
                                'qYet to be confirmed': lr[i][3], 'qTotal': lr[i][4], 'Confirmed': lr[i][5],
                                'Yet to be confirmed': lr[i][6], 'Total': lr[i][7]}, ignore_index=True)
    print(bseobj)
    bseobj = bseobj.iloc[0:0]
    time.sleep(10)
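If a terminal view is enough, one simple way to get a "static window" effect (just a sketch, not a GUI) is to clear the screen before printing each refresh, so the new table overwrites the old one:

import os

def show(df):
    # clear the terminal, then print the latest table in place
    os.system('cls' if os.name == 'nt' else 'clear')
    print(df.to_string(index=False))

# inside the while loop, call show(bseobj) instead of print(bseobj)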
I am just starting out with web scraping and I am having trouble with Beautiful Soup. I have tried changing the div class to other classes as well, but it always returns []. Here is my code:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome(executable_path="C:/Users/MuhIsmail/Downloads/cd79/chromedriver.exe")
url = "https://www.cricbuzz.com/cricket-match/live-scores"
driver.get(url)
driver.maximize_window()
time.sleep(4)
content = driver.page_source
soup = BeautifulSoup(content, "html.parser")
scores = soup.find_all('div', class_='col-xs-9 col-lg-9 dis-inline')
print(scores)
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.cricbuzz.com/cricket-match/live-scores")
soup = BeautifulSoup(r.text, 'html.parser')
for item in soup.select("a.cb-mat-mnu-itm:nth-child(5)"):
    print(item.text)
Output:
MLR vs SYS - SYS Won
It is returning [] because there are no elements on the page with that class.
If you open your browser console and do a simple
document.getElementsByClassName('col-xs-9 col-lg-9 dis-inline')
it will return no results.
I tried this as well:
import requests
from bs4 import BeautifulSoup
url = "https://www.cricbuzz.com/cricket-match/live-scores"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
scores = soup.find_all('div', {'class':'col-xs-9 col-lg-9 dis-inline'})
print(scores)
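Plain requests only sees the initial HTML, so a class that is added by JavaScript will not be in r.content either. If you already render the page with Selenium as in the question, you can parse driver.page_source with a selector that actually exists on the page, for example the one from the answer above. A minimal sketch (it assumes the driver from the question is still open and the page has finished loading):

from bs4 import BeautifulSoup

# parse the JavaScript-rendered source from the Selenium driver
soup = BeautifulSoup(driver.page_source, "html.parser")

# selector taken from the answer above; adjust it if the site's markup changes
for item in soup.select("a.cb-mat-mnu-itm"):
    print(item.get_text(strip=True))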
I'm in the process of learning Python 3 and I'm trying to solve a simple task: I want to get the account name and the post date from an Instagram link.
import requests
from bs4 import BeautifulSoup
html = requests.get('https://www.instagram.com/p/BuPSnoTlvTR')
soup = BeautifulSoup(html.text, 'lxml')
item = soup.select_one("meta[property='og:description']")
name = item.find_previous_sibling().get("content").split("•")[0]
print(name)
This code sometimes works with links like https://www.instagram.com/kingtop, but I also need it to work with image posts like https://www.instagram.com/p/BuxB00KFI-x/. That's all I could come up with, but it isn't working, and I can't get the date either. Do you have any ideas? I appreciate any help.
I found a way to get the name of the account. Now I'm trying to find a way to get the upload date:
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
from requests.exceptions import HTTPError

start = time.time()

file = open('users.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()

for url in urls:
    url = url.strip('\n')
    try:
        req = requests.get(url)
        req.raise_for_status()
    except HTTPError as http_err:
        output = open('output2.txt', 'a')
        output.write('not found\n')
    except Exception as err:
        output = open('output2.txt', 'a')
        output.write('not found\n')
    else:
        output = open('output2.txt', 'a')
        soup = BeautifulSoup(req.text, "lxml")
        the_url = soup.select("[rel='canonical']")[0]['href']
        the_url2 = the_url.replace('https://www.instagram.com/', '')
        head, sep, tail = the_url2.partition('/')
        output.write(head + '\n')
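For the upload date, one possible approach (only a sketch; it assumes the post page still embeds a taken_at_timestamp field in its inline JSON, which Instagram can change at any time) is to search the raw HTML for that field:

import re
from datetime import datetime, timezone

import requests

html = requests.get('https://www.instagram.com/p/BuxB00KFI-x/').text

# assumption: the embedded JSON contains "taken_at_timestamp": <unix time>
match = re.search(r'"taken_at_timestamp":\s*(\d+)', html)
if match:
    posted = datetime.fromtimestamp(int(match.group(1)), tz=timezone.utc)
    print(posted.isoformat())
else:
    print('timestamp not found in page source')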
I am trying to scrape a site, and the problem I am running into is that the page takes time to load, so by the time my scraping is done I may only get five items when there may be 25. Is there a way to slow down Python? I am using BeautifulSoup.
Here is the code I am using:
import urllib
import urllib.request
from bs4 import BeautifulSoup

theurl = "http://agscompany.com/product-category/fittings/tube-nuts/316-tube/"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")

for pn in soup.find_all('div', {"class": "shop-item-text"}):
    pn2 = pn.text
    print(pn2)
Thank you
All the results can be accessed from these pages:
http://agscompany.com/product-category/fittings/tube-nuts/316-tube/page/
http://agscompany.com/product-category/fittings/tube-nuts/316-tube/page/2/
...
So you can access them with a loop over the page number:
import urllib
import urllib.request
from bs4 import BeautifulSoup

theurl = "http://agscompany.com/product-category/fittings/tube-nuts/316-tube/"

for i in range(1, 5):
    thepage = urllib.request.urlopen(theurl + '/page/' + str(i) + '/')
    soup = BeautifulSoup(thepage, "html.parser")
    for pn in soup.find_all('div', {"class": "shop-item-text"}):
        pn2 = pn.text
        print(pn2)
A more generic version of @Kenavoz's answer: this approach doesn't care about how many pages there are. Also, I would go for requests rather than urllib.
import requests
from bs4 import BeautifulSoup

url_pattern = 'http://agscompany.com/product-category/fittings/tube-nuts/316-tube/page/{index}/'

status_code = 200
url_index = 1

# keep requesting successive pages until one returns a non-200 status code
while status_code == 200:
    url = url_pattern.format(index=url_index)
    response = requests.get(url)
    status_code = response.status_code
    url_index += 1

    soup = BeautifulSoup(response.content, 'html.parser')
    page_items = soup.find_all('div', {'class': 'shop-item-text'})
    for page_item in page_items:
        print(page_item.text)