I'm trying to scrape data from a real-estate site (the URL is in the code below), but I'm encountering a problem with pricing. Can you tell me if I'm reading the page wrong?
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def extract_data():
    url = "https://www.vivareal.com.br/venda/rj/rio-de-janeiro/apartamento_residencial/"
    response = requests.get(url)
    return BeautifulSoup(response.text, "html.parser")

soup = extract_data()
property_list = soup.find_all("div", class_="property-card__main-content")

# Created a dataframe with the extracted data
data = {"title": [property.find("h2", class_="property-card__title").text for property in property_list],
        "price": [property.find("div", class_="property-card__price").text for property in property_list],
        "location": [property.find("div", class_="property-card__address").text for property in property_list]}
df = pd.DataFrame(data)
Then I was preprocessing the data:
df["price"] = pd.to_numeric(df["price"].str.replace("R$ ", "").str.replace(".", "").str.replace(",", ".").str.extract('(\d+)').astype(float))
but I got this error: "Can only use .str accessor with string values!", which I think means the element is not being found by the find() method.
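For what it's worth, a minimal way to test that hypothesis before the pandas step, assuming the same URL and class names as above (the User-Agent header is my own addition, since some sites refuse the default requests client):

import requests
from bs4 import BeautifulSoup

url = "https://www.vivareal.com.br/venda/rj/rio-de-janeiro/apartamento_residencial/"
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
print(response.status_code)  # anything other than 200 means there are no cards to parse

soup = BeautifulSoup(response.text, "html.parser")
cards = soup.find_all("div", class_="property-card__main-content")
print(len(cards))  # 0 here would confirm the "not being found" hypothesis

prices = []
for card in cards:
    price_tag = card.find("div", class_="property-card__price")
    if price_tag is not None:  # skip cards where find() returns nothing
        prices.append(price_tag.get_text(strip=True))
print(prices[:5])

If every list comes back empty, the price column holds no strings at all, which is consistent with the .str accessor error.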
This is my code
from bs4 import BeautifulSoup
import requests, lxml
import re
from urllib.parse import urljoin
from googlesearch import search
import pandas as pd

query = 'A M C College of Engineering, Bangalore'
link = []
for i in search(query, tld='co.in', start=0, stop=1):
    print(i)
    soup = BeautifulSoup(requests.get(i).text, 'lxml')
    for link in soup.select("a[href$='.pdf']"):
        if re.search(r'nirf', str(link), flags=re.IGNORECASE):
            fUrl = urljoin(i, link['href'])
            print(fUrl)
            link.append(fUrl)
print(link)
df = pd.DataFrame(link, columns=['PDF LINKS'])
print(df)
Here is my output after running the code:
https://www.amcgroup.edu.in/AMCEC/index.php
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFENGG.pdf
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFMBA.pdf
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2019.pdf
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2020.pdf
# Printing list with links but getting tags
For Invitation Click here...
# Dataframe where I want to store list
PDF LINKS
0 For Invitation Click here...
I should get the list of links shown in the output, but when I print the list it gives me the whole tag instead of the link. Also, I want to push all the links I got for a query into a single row of the dataframe, like this:
PDF LINKS
0 link1 link2 link3 #for query1
1 link1 link2 #for another query
How can I achieve this? And what is the problem with my code that makes it print a tag instead of the list?
Thanks in advance.
Use a different variable name for the list and for the tag in the for-loop:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

query = "A M C College of Engineering, Bangalore"

all_data = []
for i in ["https://www.amcgroup.edu.in/AMCEC/index.php"]:
    soup = BeautifulSoup(requests.get(i).text, "lxml")
    for link in soup.select("a[href$='.pdf']"):  # <-- `link` is different than `all_data` here!
        if re.search(r"nirf", link["href"], flags=re.IGNORECASE):
            fUrl = urljoin(i, link["href"])
            all_data.append(fUrl)

df = pd.DataFrame(all_data, columns=["PDF LINKS"])
print(df)
Prints:
PDF LINKS
0 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFENGG.pdf
1 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFMBA.pdf
2 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2019.pdf
3 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2020.pdf
EDIT: To have results in one row:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

query = "A M C College of Engineering, Bangalore"

all_data = []
for i in ["https://www.amcgroup.edu.in/AMCEC/index.php"]:
    soup = BeautifulSoup(requests.get(i).text, "lxml")
    row = []
    for link in soup.select("a[href$='.pdf']"):  # <-- `link` is different than `all_data` here!
        if re.search(r"nirf", link["href"], flags=re.IGNORECASE):
            fUrl = urljoin(i, link["href"])
            row.append(fUrl)
    if row:
        all_data.append(row)

df = pd.DataFrame({"PDF LINKS": all_data})
print(df)
Prints:
PDF LINKS
0 [https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFENGG.pdf, https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFMBA.pdf, https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2019.pdf, https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2020.pdf]
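If each row should instead be a single space-separated string, as in the desired output in the question, Series.str.join works element-wise on the lists:

df["PDF LINKS"] = df["PDF LINKS"].str.join(" ")
print(df)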
I am running into an issue pulling in the human-readable header name from this table in an HTML document. I can pull in the id, but my trouble comes when trying to pull in the correct header text between the <th>...</th> tags. I am not sure what I need to do in this instance. Below is my code. It all runs except for the last for loop.
# Import libraries
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import numpy as np

# Pull the HTML link into a local file or buffer
# and then parse with the BeautifulSoup library
# ------------------------------------------------
url = 'https://web.dsa.missouri.edu/static/mirror_sites/factfinder.census.gov/bkmk/table/1.0/en/GEP/2014/00A4/0100000US.html'
r = requests.get(url)
#print('Status: ' + str(r.status_code))
#print(requests.status_codes._codes[200])

soup = BeautifulSoup(r.content, "html")
table = soup.find(id='data')
#print(table)

# Convert the data into a list of dictionaries
# or some other structure you can convert into
# a pandas DataFrame
# ------------------------------------------------
trs = table.find_all('tr')
#print(trs)

header_row = trs[0]
#print(header_row)

names = []
for column in header_row.find_all('th'):
    names.append(column.attrs['id'])
#print(names)

db_names = []
for column in header_row.find_all('a'):
    db_names.append(column.attrs['data-vo-id'])  # ISSUE ARISES HERE!!!
print(db_names)
Let pandas read_html do the work for you, and simply specify the table id to find:
from pandas import read_html as rh
table = rh('https://web.dsa.missouri.edu/static/mirror_sites/factfinder.census.gov/bkmk/table/1.0/en/GEP/2014/00A4/0100000US.html', attrs = {'id': 'data'})[0]
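From there, assuming read_html picks the header row up as it normally does, the human-readable names are simply the frame's columns:

print(table.columns.tolist())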
Hey, you can try something like this:
soup = BeautifulSoup(r.content, "html")
table = soup.findAll('table', {'id': 'data'})
trs = table[0].find_all('tr')
#print(trs)

names = []
for row in trs[:1]:                    # header row only
    header_cells = row.find_all('th')  # the header cells are <th>, not <td>
    for cell in header_cells:
        names.append(cell.text.strip())
This is the url: https://yorkathletics.com/sports/mens-swimming-and-diving/roster
If I run this command:
soup.find_all('span', class_="sidearm-roster-player-height")
and then try to get the length of the output, it is 20 when it is supposed to be 10.
I can't see why this happens.
Change your class selector as follows:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://yorkathletics.com/sports/mens-swimming-and-diving/roster')
soup = bs(r.content, 'lxml')
print([i.text for i in soup.select('.height')])
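As for why the original find_all returned 20 instead of 10: most likely the page's markup repeats each height value, e.g. once per layout variant, so the class-based search matches duplicates; that is an assumption about the page, not something the question confirms. A quick way to compare the two selectors:

print(len(soup.select('span.sidearm-roster-player-height')))  # 20 per the question
print(len(soup.select('.height')))                            # 10 expected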
Note: You can grab the whole table with pandas:
import pandas as pd
table = pd.read_html('https://yorkathletics.com/sports/mens-swimming-and-diving/roster')[2]
print(table)
I'm trying to scrape the box score table from https://www.nascar.com/stats/2021/1/box-score, but my code is not working. If someone could take a look and point me in the right direction, I'd appreciate it.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.nascar.com/stats/2021/1/box-score'
response = requests.get(url, headers=headers)
response.content

soup = BeautifulSoup(response.content, 'html.parser')
stats = soup.find_all('table', class_="stats-box-score-table-driver")
stats

for row in stats.find_all('tr'):
    for cell in row.find_all('td'):
        print(cell.text)
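Two concrete problems stand out in that script: headers is used without ever being defined, and find_all returns a ResultSet, so stats.find_all('tr') raises an AttributeError; you need to iterate the tables (or index stats[0]) before looking for rows. A sketch with both fixed, where the User-Agent value is my own addition; note that if nascar.com renders this table with JavaScript, requests alone may still come back empty:

import requests
from bs4 import BeautifulSoup

url = 'https://www.nascar.com/stats/2021/1/box-score'
headers = {'User-Agent': 'Mozilla/5.0'}  # must be defined before requests.get uses it
response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.content, 'html.parser')
tables = soup.find_all('table', class_='stats-box-score-table-driver')
print(len(tables))  # 0 would suggest the table is built client-side

for table in tables:  # iterate the ResultSet, then the rows inside each table
    for row in table.find_all('tr'):
        for cell in row.find_all('td'):
            print(cell.text.strip())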
I'm performing the same web scraping pattern that I just learned from a previous post; however, I'm unable to scrape the data using the script below. I keep getting an empty return, and I know the tags are there. I want to find_all "mubox" and then pull values for O/U and goalie information. This is so weird; what am I missing?
from bs4 import BeautifulSoup
import requests
import pandas as pd
page_link = 'https://www.thespread.com/nhl-scores-matchups'
page_response = requests.get(page_link, timeout=10)
# here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "html.parser")
# Take out the <div> of name and get its value
tables = page_content.find_all("div", class_="mubox")
print (tables)
# Iterate through rows
rows = []
This site uses an internal API before rendering the data. The API returns an XML file (the URL used below) which contains all the match information. You can parse it using Beautiful Soup:
from bs4 import BeautifulSoup
import requests

page_link = 'https://www.thespread.com/matchups/NHL/matchup-list_20181030.xml'
page_response = requests.get(page_link, timeout=10)

body = BeautifulSoup(page_response.content, "lxml")

data = [
    (
        t.find("road").text,
        t.find("roadgoalie").text,
        t.find("home").text,
        t.find("homegoalie").text,
        float(t.find("ot").text),
        float(t.find("otmoney").text),
        float(t.find("ft").text),
        float(t.find("ftmoney").text)
    )
    for t in body.find_all('event')
]
print(data)
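Since the question already imports pandas, a possible follow-up is turning those tuples into a labeled frame; the column names below are my own, chosen to match the tuple order above:

import pandas as pd

df = pd.DataFrame(data, columns=['road', 'road_goalie', 'home', 'home_goalie',
                                 'ot', 'ot_money', 'ft', 'ft_money'])
print(df)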