Python code to loop through a list of postcodes and get the GP practices for those postcodes by scraping the Yellow Pages (Australia) - python-3.x

The code below gives me the following error:
ValueError: Length mismatch: Expected axis has 0 elements, new values have 1 elements
on the df.columns = ["GP Practice Name"] line.
I tried
import pandas as pd
import requests
from bs4 import BeautifulSoup
postal_codes = ["2000", "2010", "2020", "2030", "2040"]
# Filled by the loop below: postcode -> list of names returned by get_places().
places_by_postal_code = {}


def get_places(postal_code):
    """Return the <h2> text of every "listing-content" div on the search page for *postal_code*."""
    url = f"https://www.yellowpages.com.au/search/listings?clue={postal_code}&locationClue=&latitude=&longitude=&selectedViewMode=list&refinements=category:General%20Practitioner&selectedSortType=distance"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    places = soup.find_all("div", {"class": "listing-content"})
    return [place.find("h2").text for place in places]


for postal_code in postal_codes:
    places = get_places(postal_code)
    places_by_postal_code[postal_code] = places

# orient='index' gives one row per postcode and one column per list position,
# so the number of columns equals the length of the longest scraped list.
df = pd.DataFrame.from_dict(places_by_postal_code, orient='index')
# Reported failure point. "Expected axis has 0 elements" means the frame above
# has no columns, i.e. every list was empty.
# NOTE(review): presumably the request returns no "listing-content" divs
# (blocked or client-rendered page) - confirm by inspecting response.text.
df.columns = ["GP Practice Name"]
df = pd.DataFrame(places_by_postal_code.values(), index=places_by_postal_code.keys(), columns=["GP Practice Name"])
print(df)
and was expecting a list of GPs for the postcodes specified in the postal_codes variable.

Related

How to scrape a table from a website when the BS4 selection won't find it?

I'm using the code below to scrape the table element from the URL (www.sfda.gov.sa/en/cosmetics-list), but the result comes back empty.
from bs4 import BeautifulSoup
import requests
import pandas as pd
url="https://www.sfda.gov.sa/en/cosmetics-list"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
# Look the table up by its class string in the HTML that requests downloaded.
table = soup.find('table', attrs={'class':'table table-striped display'})
table_rows = table.find_all('tr')
# `res` is rebound here: it stops being the HTTP response and becomes the list of rows.
res = []
for tr in table_rows:
    td = tr.find_all('td')
    # Keep the stripped text of every non-empty cell (the comprehension reuses the name `tr`).
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    # Rows without any non-empty <td> are skipped.
    if row:
        res.append(row)
df = pd.DataFrame(res, columns=["ProductName", "Category", "Country", "Company"])
print(df)
Running above code but not getting data
Data is loaded via XHR so you should use this to get your information:
# Query the JSON endpoint directly and tabulate its "results" list.
url = 'https://www.sfda.gov.sa/GetCosmetics.php?page=1'
payload = requests.get(url).json()
pd.DataFrame(payload['results'])
Example
Loop over number of pages in range() and collect all data.
import requests
import pandas as pd
# Collect the "results" records of pages 1-4 into one flat list, then tabulate.
page_numbers = range(1, 5)
data = []
for page in page_numbers:
    response = requests.get(f'https://www.sfda.gov.sa/GetCosmetics.php?page={page}')
    data += response.json()['results']
pd.DataFrame(data)
Output
id
cosmatics_Id
productNotificationsId
productNumber
status
productArName
productEnName
brandName
catArabic
catEnglish
counrtyAr
counrtyEn
manufactureType
packageVolume
unitAr
unitEn
barcode
manufacturearabicname
manufactureenglishname
listedNameAr
listedNameEn
imageUrl
batchNumber
country_of_manufacturing_English
country_of_manufacturing_Arabic
productCreationDate
productexpireddate
subCategory1
subCategoryAR
storageCircumstances
protectionInstructions
usageInstructions
notes
mainCommercialRecordNumber
manufacturingLicenseNumber
0
549105
58472
10518
2020-011019101291-245945
Active
ليتسيا كوبيبا
Litsea cubeba oil
MOKSHA LIFE STYLE
منتجات العناية بالبشرة
Skin products
الهند
India
Foreign
250
ملي لتر
Milliliter (ml)
0
موكشا لايف ستايل برودكت
Moksha lifestyle products
مؤسسة شجور الارض للتجارة
shojoor alearth trading
India
الهند
2020-09-28T09:40:46
2025-10-05T09:40:46
Perfumes
العطور
room temperature
تاريخ انتهاء الصلاحية
الاستعمال الخارجي
7016000957
FR555666
...
9
84386
58481
4031
2016-0120132-048982
Active
جودي ثيرابي سيستيم للشعر بالبروتين
Judy protein & Silk hair therapy system
Judy
منتجات العناية بالشعر وفروة الرأس
Hair and scalp products
الولايات المتحدة
United States
Foreign
1000
ملي لتر
Milliliter (ml)
641243925950
معامل ناتيورال كوزماتيك
natural cosmetic labs USA Inc.,
شركه بيت جودي الدوليه للتجارة
bait gody for trading co.
United States
الولايات المتحدة
2016-12-25T14:40:44
2027-01-01T14:40:44
Hair styling products
منتجات تصفيف الشعر
7007289163
FR555666
You can use concurrent.futures to concurrently scrape pages and when all pages are complete concat the results into a single dataframe:
import concurrent.futures
import json
import os
import pandas as pd
import requests
class Scrape:
    """Download every page of the SFDA cosmetics endpoint and combine them into one DataFrame."""

    def __init__(self):
        self.root_url = "https://www.sfda.gov.sa/GetCosmetics.php?"
        # The page count comes from the endpoint itself, so constructing the
        # object already performs one HTTP request.
        self.pages = self.get_page_count()
        self.processors = os.cpu_count()

    def get_page_count(self) -> int:
        """Return the "pageCount" value reported by the root endpoint."""
        return self.get_data(url=self.root_url).get("pageCount")

    # Must be a real decorator: as a plain method, `self.get_data(url=...)`
    # would bind `self` to `url` and fail with "multiple values for argument".
    @staticmethod
    def get_data(url: str) -> dict:
        """Fetch *url* and return its decoded JSON body.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        with requests.Session() as request:
            response = request.get(url, timeout=30)
            # Raise on error statuses instead of printing the None it returns on success.
            response.raise_for_status()
            return json.loads(response.text)

    def process_pages(self) -> pd.DataFrame:
        """Fetch pages 1..self.pages in a process pool and return them concatenated."""
        page_range = range(1, self.pages + 1)
        with concurrent.futures.ProcessPoolExecutor(max_workers=self.processors) as executor:
            # executor.map yields results in page order; the index is rebuilt
            # so the combined frame is numbered 0..n-1.
            return pd.concat(executor.map(self.parse_data, page_range)).reset_index(drop=True)

    def parse_data(self, page: int) -> pd.DataFrame:
        """Return the records of one page with the four wanted columns renamed."""
        url = f"{self.root_url}page={page}"
        data = self.get_data(url=url)
        records = pd.json_normalize(data=data, record_path="results")
        wanted = records[["productEnName", "catEnglish", "counrtyEn", "brandName"]]
        return wanted.rename(
            columns={
                "productEnName": "ProductName",
                "catEnglish": "Category",
                "counrtyEn": "Country",
                "brandName": "Company",
            }
        )
# Run the scrape only when the file is executed as a script, not on import.
# NOTE(review): presumably also needed because process-pool workers may
# re-import this module - confirm for the start method in use.
if __name__ == "__main__":
    final_df = Scrape().process_pages()
    print(final_df)

extracting columns contents only so that all columns for each row are in the same row using Python's BeautifulSoup

I have the following python snippet in Jupyter Notebooks that works.
The challenge I have is to extract just the rows of columnar data only
Here's the snippet:
from bs4 import BeautifulSoup as bs
import pandas as pd
# NOTE(review): `requests` is used here but is not among this snippet's imports.
page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
# Bare expression: shown as cell output in a notebook.
page
soup = bs(page.content)
soup
# Collect every <p> element of the parsed page.
allrows = soup.find_all("p")
print(allrows)
I'm a little unclear of what you are after but I think it's each individual row of data from URL provided.
I couldn't find a way to use beautiful soup to parse the data you are after but did find a way to separate the rows using .split()
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
soup = bs(page.content)
text = soup.text  # turn soup into text
text_split = text.split('\n\n')  # split the page into 3 sections
data = text_split[2]  # rows of data
# create df column titles using the variable titles on the page
col_titles = text_split[1].split('\n')
columns = col_titles[1:]

# Every record is wrapped over two consecutive lines: glue each even-indexed
# line to the odd-indexed line that follows it. zip() stops at the last
# complete pair, so no IndexError is needed to end the loop.
rows = data.split('\n')
complete_row = [first + second for first, second in zip(rows[::2], rows[1::2])]

# Split each glued row on spaces and drop the empty strings left by runs of
# spaces, then build the frame in one call instead of growing it row by row.
records = [[el for el in row.split(' ') if el] for row in complete_row]
df = pd.DataFrame(records, columns=columns)
# finished dataframe
df

Pass url column's values one by one to web crawler code in Python

Based on the answered code from this link, I'm able to create a new column: df['url'] = 'https://www.cspea.com.cn/list/c01/' + df['projectCode'].
As the next step, I would like to pass the url column's values to the following code one by one and append all the scraped contents to a single dataframe.
import urllib3
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186" # url column's values should be passed here one by one
page = requests.get(url, verify=False)
soup = BeautifulSoup(page.content, "html.parser")
# Each <th> in the detail panel is a label; the next <td> after it is its value.
header_cells = soup.select(".project-detail-left th")
index = [cell.get_text(strip=True) for cell in header_cells]
data = [cell.find_next("td").get_text(strip=True) for cell in header_cells]
df = pd.DataFrame(data, index=index, columns=["value"])
print(df)
How could I do that in Python? Thanks.
Updated:
import requests
from bs4 import BeautifulSoup
import pandas as pd
df = pd.read_excel('items_scraped.xlsx')
data = []
urls = df.url.tolist()
for url_link in urls:
    url = url_link
    # url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"
    soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
    # Both lists are reset for every URL, so they only describe the current page.
    index, data = [], []
    for th in soup.select(".project-detail-left th"):
        h = th.get_text(strip=True)
        t = th.find_next("td").get_text(strip=True)
        index.append(h)
        data.append(t)
    # `df` is rebound on every pass, so the frame built for the previous URL is discarded.
    df = pd.DataFrame(data, index=index, columns=["value"])
    # Transpose so the labels become columns and the page becomes a single row.
    df = df.T
    df.reset_index(drop=True, inplace=True)
    print(df)
# Writes whatever `df` holds after the loop, i.e. only the last page's row.
df.to_excel('result.xlsx', index = False)
But it only saved one row to the Excel file.
You need to combine the dfs generated in the loop. You could add them to a list and then call pd.concat on that list.
import requests
from bs4 import BeautifulSoup
import pandas as pd
df = pd.read_excel('items_scraped.xlsx')
urls = df.url.tolist()
dfs = []  # one single-row frame per scraped page
for url in urls:
    # NOTE(review): verify=False turns off TLS certificate checking; kept from
    # the original snippet - confirm it is really required for this site.
    soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
    index, data = [], []
    for th in soup.select(".project-detail-left th"):
        # Each <th> is a label; the next <td> after it holds the value.
        index.append(th.get_text(strip=True))
        data.append(th.find_next("td").get_text(strip=True))
    # Use a separate name so the input frame `df` is not overwritten inside the
    # loop; transpose so labels become columns and the page is one row.
    page_df = pd.DataFrame(data, index=index, columns=["value"]).T.reset_index(drop=True)
    print(page_df)
    dfs.append(page_df)
# pd.concat raises on an empty list, so fall back to an empty frame when no
# URLs were read; ignore_index numbers the combined rows 0..n-1.
df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
df.to_excel('result.xlsx', index=False)
Use
urls = df.url.tolist()  # plain Python list of the values in the "url" column
To create a list of URLs and then iterate through them using f string to insert each one into your base url

How to assign bs4 strings to pandas dataframe in for loop?

I have an HTML string that I am successfully able to use beautifulsoup4 on to extract the elements I need.
the HTML strings are in a list and I am wanting to extract only certain elements out of the strings and assign them to dataframe columns.
Current code:
import pandas as pd
from bs4 import BeautifulSoup
# Placeholder for the list of HTML strings (not valid Python as written).
lst = [ <html>,<html>]
df = pd.DataFrame()
for i in lst:
    soup = BeautifulSoup(i)
    for link in soup.find_all('a'):
        # A plain string is assigned to a column of a frame that has no rows;
        # the question reports that the column appears but stays empty.
        df['links'] = str(link.get('href'))
        #print(link.get('href'))
    #get all text messages
    # The bare find_all calls below discard their result; only the assignments matter.
    soup.find_all('p')
    df['messages'] = str(soup.find_all('p'))
    #get author name
    soup.find_all(class_="author--name")
    df['author'] = str(soup.find_all(class_="author--name"))
    #get username
    soup.find_all(class_= "author--username")
    df['username'] = str(soup.find_all(class_= "author--username"))
All the soup lines of code are producing the data I need, but why is the dataframe not assigning the string values to the dataframe columns?
I can see that from an empty dataframe, the code creates the new columns but there are no values.
What am I doing wrong?
The solution was to wrap the assignments in brackets like so:
frames = []  # one single-row frame per HTML string
for i in lst:
    # Start from an empty frame for this document.
    df = pd.DataFrame()
    soup = BeautifulSoup(i)
    for link in soup.find_all('a'):
        # Wrapping the value in a list supplies a one-element column, which
        # gives the empty frame its first row. Each pass overwrites the
        # column, so the last href in the document is the one kept.
        df['links'] = [str(link.get('href'))]
    # get all text messages
    df['messages'] = [str(soup.find_all('p'))]
    # get author name
    df['author'] = [str(soup.find_all(class_="author--name"))]
    # get username
    df['username'] = [str(soup.find_all(class_="author--username"))]
    # Keep this document's row; otherwise the next pass would throw it away.
    frames.append(df)
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

how to fix the indexing error and to scrape the data from a webpage

I want to scrape data from a webpage from a wayback machine using pandas. I used string split to split some string if its present.
the URL for the webpage is this
Here is my code:
import pandas as pd
url = "https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm"
# read_html returns a list of frames; the first one is used.
dfs = pd.read_html(url)
df = dfs[0]
# Locate the row whose first cell is exactly the "Next >>" text; the row just
# above it is then used as the header row.
# NOTE(review): `.index[0]` fails when no row matches - presumably the source
# of the reported indexing error; confirm against the actual table content.
idx = df[df[0] == '\xa0Next >>'].index[0]
# Error mentioned in comment happens on the above line.
cols = list(df.iloc[idx-1,:])
df.columns = cols
# Keep only rows whose 'Const. No.' is present and purely numeric.
df = df[df['Const. No.'].notnull()]
df = df.loc[df['Const. No.'].str.isdigit()].reset_index(drop=True)
# Drop columns that are empty in every remaining row.
df = df.dropna(axis=1,how='all')
# For each column keep only the text before the first occurrence of the separator.
df['Leading Candidate'] = df['Leading Candidate'].str.split('i',expand=True)[0]
df['Leading Party'] = df['Leading Party'].str.split('iCurrent',expand=True)[0]
df['Trailing Party'] = df['Trailing Party'].str.split('iCurrent',expand=True)[0]
df['Trailing Candidate'] = df['Trailing Candidate'].str.split('iAssembly',expand=True)[0]
df.to_csv('Chhattisgarh_cand.csv', index=False)
The expected output from that webpage must be in csv format like
You can use BeautifulSoup to extract the data. Pandas will help you process the data efficiently, but it's not meant for data extraction.
import pandas as pd
from bs4 import BeautifulSoup
import requests
response = requests.get('https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm?st=S26')
soup = BeautifulSoup(response.text,'lxml')
table_data = []
# Pick the tables whose markup mentions a known party name; the first match is
# treated as the results table.
required_table = [table for table in soup.find_all('table') if 'Indian National Congress' in str(table)]
if required_table:
    # Only rows carrying this exact inline style are collected.
    for tr_tags in required_table[0].find_all('tr',{'style':'font-size:12px;'}):
        table_data.append([td_tags.text.strip() for td_tags in tr_tags.find_all('td')])
# The first collected row is skipped.
df = pd.DataFrame(table_data[1:])
df.to_csv("DataExport.csv",index=False)
You can expect result like this in pandas dataframe,
0 1 ... 6 7
0 BILASPUR 5 ... 176436 Result Declared
1 DURG 7 ... 16848 Result Declared
2 JANJGIR-CHAMPA 3 ... 174961 Result Declared
3 KANKER 11 ... 35158 Result Declared
4 KORBA 4 ... 4265 Result Declared
The code below should get you the table on your url link ("Chhattisgarh Result Status") using a combination of BS and pandas; you can then save it as csv:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
from io import StringIO

url = "https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm?st=S26"
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    elect = response.read()
soup = BeautifulSoup(elect,"lxml")
res = soup.find_all('table')
# Parse only the eighth <table> element. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(res[7])))
# read_html returns a list of frames; show the fourth one.
df[3]

Resources