Again, I'm new to Python, but in other languages error handling is easy, and I'm not sure how to do it here. The temperature element on the web page doesn't hold a value for the first item in the list, but the other items have it. How do I ignore an item if it doesn't return anything? I want the rest of the code to continue and simply skip the list entry that doesn't have the class.
Code
import requests
from bs4 import BeautifulSoup
page = requests.get('https://forecast.weather.gov/MapClick.php?lat=28.57117500000004&lon=-81.38776499999994#.YHH8fehKiUk')
soup = BeautifulSoup(page.content, 'html.parser')
week = soup.find(id='seven-day-forecast-body')
items=week.find_all(class_='tombstone-container')
period_names = [item.find(class_='period-name').get_text() for item in items]
short_descriptions = [item.find(class_='short-desc').get_text() for item in items]
temperatures = [item.find(class_='temp').get_text() for item in items]
print(period_names)
print(short_descriptions)
print(temperatures)
Error
Traceback (most recent call last):
  File "c:\Users\14074\AppData\Local\Programs\Python\Python39\Youtube_posting.py", line 12, in <module>
    temperatures = [item.find(class_='temp').get_text() for item in items]
  File "c:\Users\14074\AppData\Local\Programs\Python\Python39\Youtube_posting.py", line 12, in <listcomp>
    temperatures = [item.find(class_='temp').get_text() for item in items]
AttributeError: 'NoneType' object has no attribute 'get_text'
You can first check whether the element with class="temp" exists in the item and, if not, fall back to a default value (e.g. "N/A"):
import requests
from bs4 import BeautifulSoup
url = "https://forecast.weather.gov/MapClick.php?lat=28.57117500000004&lon=-81.38776499999994#.YHH8fehKiUk"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
week = soup.find(id="seven-day-forecast-body")
items = week.find_all(class_="tombstone-container")
period_names = [
item.find(class_="period-name").get_text(separator=" ") for item in items
]
short_descriptions = [
item.find(class_="short-desc").get_text(separator=" ") for item in items
]
temperatures = [
i.get_text(separator=" ") if (i := item.find(class_="temp")) else "N/A"
for item in items
]
for p, d, t in zip(period_names, short_descriptions, temperatures):
print("{:<30} {:<40} {}".format(p, d, t))
Prints:
NOW until 7:00pm Sat Red Flag Warning N/A
This Afternoon Chance T-storms High: 90 °F
Tonight Showers Likely then Chance Showers Low: 69 °F
Sunday Heavy Rain High: 79 °F
Sunday Night Chance Showers Low: 67 °F
Monday Mostly Sunny High: 86 °F
Monday Night Mostly Clear Low: 62 °F
Tuesday Sunny High: 87 °F
Tuesday Night Partly Cloudy Low: 65 °F
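Note that the := walrus operator used above needs Python 3.8 or newer; on older versions an equivalent (a small variation, not part of the original answer) is a plain loop:
temperatures = []
for item in items:
    temp = item.find(class_="temp")
    # fall back to "N/A" when the tombstone has no temp element
    temperatures.append(temp.get_text(separator=" ") if temp else "N/A")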
An alternative is to catch the AttributeError with a try/except. Be aware that wrapping the whole list comprehension in try/except would abandon the entire list the moment one item fails, so to skip just the offending item, build the list with a loop instead:
temperatures = []
for item in items:
    try:
        temperatures.append(item.find(class_='temp').get_text())
    except AttributeError:
        continue  # this item has no temp element, move on to the next one
This does exactly what you asked: any item whose item.find(class_='temp') returns None is skipped, and the loop moves on to the next item.
Related
I am scraping data from GDELT [https://www.gdeltproject.org]. It is a pretty cool project that checks ~100,000 news sites each day, labels all the articles, and makes them available. I am getting an AttributeError while extracting the data. The code I use is the following:
import gdelt
gd = gdelt.gdelt(version=1)
from statsmodels.tsa.api import VAR
import pandas as pd
import os
os.makedirs("data",exist_ok=True)
import datetime
cur_date = datetime.datetime(2022,1,10) - datetime.timedelta(days=10)
end_date = datetime.datetime(2022,1,10)
year = cur_date.year
month = str(cur_date.month)
day = str(cur_date.day)
if cur_date.month < 10:
month = "0" + month
if cur_date.day < 10:
day = "0" + day
gd.Search(['%s %s %s'%(year, month, day)],table='gkg',coverage=True, translation=False)
I am getting this AttributeError:
AttributeError Traceback (most recent call last)
<ipython-input-10-2f00cabbf1ac> in <module>
----> 1 results = gd.Search(['%s %s %s'%(year, month, day)],table='gkg',coverage=True,
translation=False)
~\anaconda3\lib\site-packages\gdelt\base.py in Search(self, date, table, coverage,
translation, output, queryTime, normcols)
646
647 if self.table == 'gkg' and self.version == 1:
--> 648 results.columns = results.ix[0].values.tolist()
649 results.drop([0], inplace=True)
650 columns = results.columns
~\anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
5463 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5464 return self[name]
-> 5465 return object.__getattribute__(self, name)
5466
5467 def __setattr__(self, name: str, value) -> None:
AttributeError: 'DataFrame' object has no attribute 'ix'
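A hedged note, not from the original post: the traceback shows gdelt's base.py calling results.ix, and the .ix indexer was removed in pandas 1.0, so the installed pandas is likely too new for that gdelt release. A quick check, with the possible fixes left as assumptions in the comments:
import pandas as pd
print(pd.__version__)  # pandas 1.x or newer no longer provides DataFrame.ix
# Possible fixes (assumptions, not verified here):
#   pip install "pandas<1.0"      # pin a pandas release that still has .ix
#   pip install --upgrade gdelt   # or check whether a newer gdelt release avoids .ix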
I am trying to create a dataframe with python's pandas library using data obtained from a requests response. The problem is that when the item is not available on the API, a KeyError is raised and the program crashes.
The source dataframe is iterated over by product name. For each row it takes the product name, finds how many different SKUs exist, and creates a row in a new dataframe for each SKU, adding the quantities and other needed information. The idea is to repeat the row from the first dataframe once per SKU, updated with the quantity and package ID for that SKU.
If the length of the response returned is 0, I still want it to append the row from the first dataframe.
def create_additional_rows_needed(comb_data):
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")
new_combined_data = pd.DataFrame(columns=comb_data.columns)
COVA_DATA_LEN = 2993
row = 0
current_item = ''
while row < len(comb_data):
number_of_skus = 0
current_item = comb_data.iloc[row, 1]
if (len(current_item)) is not None:
number_of_skus = len(find_gb_product(current_item))
else:
number_of_skus = 0
current_quantity = find_gb_product(current_item).iloc[number_of_skus - 1, find_gb_product(current_item).columns.get_loc('quantity')]
logger.info('Current Quantity: {}'.format(current_quantity))
current_package = find_gb_product(current_item)['lot_number'][number_of_skus - 1]
if number_of_skus == 0:
pass
while number_of_skus > 0:
logger.info('Current Item: {}'.format(current_item))
logger.info('Number of Skus: {}'.format(number_of_skus))
logger.info('Appending: {}'.format(comb_data.iloc[row, 1]))
new_combined_data = new_combined_data.append([comb_data.iloc[row, :]])
new_combined_data.iloc[-1, new_combined_data.columns.get_loc('TotalOnHand')] = current_quantity
new_combined_data.iloc[-1, new_combined_data.columns.get_loc('PackageId')] = current_package
number_of_skus = number_of_skus - 1
logger.info('Finished index {}'.format(row))
row = row + 1
logger.info('Moving to index {}'.format(row))
return new_combined_data
It goes well for every item with the exception of a few. Here is the error I get.
KeyError
2889 return self._engine.get_loc(casted_key)
2890 except KeyError as err:
-> 2891 raise KeyError(key) from err
2892
2893 if tolerance is not None:
KeyError: 'quantity'
This has taken up my entire weekend and all my sleep, and it is due Monday morning at 10am MST with only two days' notice. Please help me.
Catching the error and continuing should work. One caveat: because row is only incremented at the end of your while loop, advance it before the continue, or the loop will get stuck on the same row forever. Something along the lines of:
while row < len(comb_data):
    ....
    try:
        current_quantity = find_gb_product(current_item).iloc[number_of_skus - 1, find_gb_product(current_item).columns.get_loc('quantity')]
    except KeyError:
        row = row + 1  # advance past the bad item before skipping it
        continue
    ....
I am trying to make a bill:
price = {'sugar' : 45,'rice': 60,'tealeaves':450,'wheat':40,'oil':100};
ordered = {'sugar':2,'rice': 3,'tealeaves':0.5,'wheat':4,'oil':1}
total = list()
for k,v in price:
value = price[k]*kgsordered[k]
print (k,':',value)
total.append(value)
print('*'*4,'CG Grocery Store','*'*4)
print('Your final bill is ₹',total.sum())
print('Thank you for shopping with us!!')
Here is the traceback:
Traceback (most recent call last):
  File "C:\Users\user\Desktop\My Python Files\curiosity gym python HW.py", line 4, in <module>
    for k,v in price:
ValueError: too many values to unpack (expected 2)
Firstly, you have to use .items() to iterate over a dictionary's key/value pairs.
Secondly, you were using kgsordered[k] instead of ordered[k], which would give you a NameError, since kgsordered isn't defined.
Finally, if you want the sum of all the elements in a list, use sum(total), where total is your list:
price = {'sugar' : 45,'rice': 60,'tealeaves':450,'wheat':40,'oil':100};
ordered = {'sugar':2,'rice': 3,'tealeaves':0.5,'wheat':4,'oil':1}
total = list()
for k,v in price.items():
value = price[k]*ordered[k]
print (k,':',value)
total.append(value)
print('*'*4,'CG Grocery Store','*'*4)
print('Your final bill is ₹',sum(total))
print('Thank you for shopping with us!!')
# output
sugar : 90
rice : 180
tealeaves : 225.0
wheat : 160
oil : 100
**** CG Grocery Store ****
Your final bill is ₹ 755.0
Thank you for shopping with us!!
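As a side note, since both dicts share the same keys, the total could also be computed directly (a small variation, not part of the answer above):
# multiply each price by the ordered quantity and add everything up
total_bill = sum(price[k] * ordered[k] for k in price)
print('Your final bill is ₹', total_bill)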
I am able to scrape data from multiple web pages on a web site using BeautifulSoup, and I am using pandas to make a table of the data. The problem is that I cannot get all of the arrays to be the same length, and I get:
ValueError: arrays must all be same length
Here is the code I have tried:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []
# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)
pages = [str(i) for i in range(0,6)]
for page in pages:
response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
html_soup = BeautifulSoup(response, 'html.parser')
# Extract data from individual listing containers
listing_containers = html_soup.find_all('div', class_='_3PdAH')
print(type(listing_containers))
print(len(listing_containers))
print("Page " + str(page))
for container in listing_containers:
address = container.a
if address is not None:
addresses.append(address.text)
elif address is None:
addresses.append('None')
else:
address.append(np.nan)
geography = container.find('div', class_='_1dhrl')
if geography is not None:
geographies.append(geography.text)
elif geography is None:
geographies.append('None')
else:
geographies.append(np.nan)
rent = container.find('div', class_='_3e12V')
if rent is None:
rents.append('None')
elif rent is not None:
rents.append(rent.text)
else:
rents.append(np.nan)
unit = container.find('div', class_='_2tApa')
if unit is None:
rents.append('None')
elif rent is not None:
units.append(unit.text)
else:
rents.append(np.nan)
availability = container.find('div', class_='_2P6xE')
if availability is None:
availabilities.append('None')
elif availability is not None:
availabilities.append(availability.text)
else:
availabilities.append(np.nan)
print(len(addresses))
print(len(geographies))
print(len(rents))
print(len(units))
print(len(availabilities))
minlen = min(len(addresses), len(geographies), len(rents), len(units), len(availabilities))
print('Minimum Array Length on this Page = ' + str(minlen))
test_df = pd.DataFrame({'Street' : addresses,
'City-State-Zip' : geographies,
'Rent' : rents,
'BR/BA' : units,
'Units Available' : availabilities
})
print(test_df)
Here is the output with error, and I have printed the length of each array for each web page to show that the problem first occurs on "Page 5":
236 Properties
<class 'bs4.element.ResultSet'>
30
Page 0
30
30
30
30
30
Minimum Array Length on this Page = 30
<class 'bs4.element.ResultSet'>
30
Page 1
60
60
60
60
60
Minimum Array Length on this Page = 60
<class 'bs4.element.ResultSet'>
30
Page 2
90
90
90
90
90
Minimum Array Length on this Page = 90
<class 'bs4.element.ResultSet'>
30
Page 3
120
120
120
120
120
Minimum Array Length on this Page = 120
<class 'bs4.element.ResultSet'>
30
Page 4
150
150
150
150
150
Minimum Array Length on this Page = 150
<class 'bs4.element.ResultSet'>
30
Page 5
180
180
188
172
180
Minimum Array Length on this Page = 172
Traceback (most recent call last):
File "renttucktabletest.py", line 103, in <module>
'Units Available' : availabilities
...
ValueError: arrays must all be same length
For the result, I either want to cut the arrays short so they all stop at the minimum length (in this case, 172), or to pad the shorter arrays with NaN or 'None' up to the maximum length (in this case, 188), so that they are all equal.
I would prefer to find a solution that does not include more advanced coding than BeautifulSoup and pandas.
Building the DataFrame from a dict of pandas Series pads the shorter lists with NaN up to the length of the longest one, so the unequal lengths no longer matter:
d = {'Street' : addresses,
'City-State-Zip' : geographies,
'Rent' : rents,
'BR/BA' : units,
'Units Available' : availabilities
}
test_df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in d.items()]))
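And a hedged sketch of the other option you mentioned, truncating every list to the shortest one so the columns line up (not part of the original answer):
# cut every list down to the length of the shortest list before building the frame
minlen = min(len(v) for v in d.values())
test_df = pd.DataFrame({k: v[:minlen] for k, v in d.items()})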
When scraping, it is better to put the record generated by each iteration into a temporary dict first and then append that dict to a list, as demonstrated below; that way a missing field only affects its own record instead of pushing the parallel lists out of sync:
import numpy as np
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Scraping all pages
pages_url = requests.get("https://www.rent.com/new-york/tuckahoe-apartments")
pages_soup = BeautifulSoup(pages_url.text, "html.parser")
list_nums = pages_soup.find("div", class_="_1y05u").text
pages = [str(i) for i in range(0, 6)]
records = []
for page in pages:
response = requests.get(
"https://www.rent.com/new-york/tuckahoe-apartments?page=" + page
).text
html_soup = BeautifulSoup(response, "html.parser")
# Extract data from individual listing containers
listing_containers = html_soup.find_all("div", class_="_3PdAH")
print("Scraping page " + str(page))
for container in listing_containers:
# Dict to hold one record
result = {}
address = container.a
if address is None:
result["Street"] = np.nan
else:
result["Street"] = address.text
geography = container.find("div", class_="_1dhrl")
if geography is None:
result["City-State-Zip"] = np.nan
else:
result["City-State-Zip"] = geography.text
rent = container.find("div", class_="_3e12V")
if rent is None:
result["Rent"] = np.nan
else:
result["Rent"] = rent.text
unit = container.find("div", class_="_2tApa")
if unit is None:
result["BR/BA"] = np.nan
else:
result["BR/BA"] = unit.text
availability = container.find("div", class_="_2P6xE")
if availability is None:
result["Units Available"] = np.nan
else:
result["Units Available"] = availability.text
print("Record: ", result)
records.append(result)
test_df = pd.DataFrame(records)
print(test_df)
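A small illustration (not from the original answer) of why the list-of-dicts approach is forgiving: if a record is missing a key, pandas simply fills NaN for that column instead of shifting anything.
import pandas as pd

# the second record has no "BR/BA" key; its cell becomes NaN and the columns still line up
demo = pd.DataFrame([{"Rent": "$1,500", "BR/BA": "1 Bed"}, {"Rent": "$1,800"}])
print(demo)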
I am trying to select specific fields from my Qdata.txt file and use fields[2] to calculate the average for each year separately. My code gives only the total average.
The data file looks like this (the first day of each year is 101 and the last is 1231):
Date 3700300 6701500
20000101 21.00 223.00
20000102 20.00 218.00
. .
20001231 7.40 104.00
20010101 6.70 104.00
. .
20130101 8.37 111.63
. .
20131231 45.00 120.98
import sys
td=open("Qdata.txt","r") # open file Qdata
total=0
count=0
row1=True
for row in td :
if (row1) :
row1=False # row1 is for topic
else:
fields=row.split()
try:
total=total+float(fields[2])
count=count+1
# Errors.
except IndexError:
continue
except ValueError:
print("File is incorrect.")
sys.exit()
print("Average in 2000 was: ",total/count)
You could use itertools.groupby, with the first four characters of each line (the year) as the grouping key.
with open("data.txt") as f:
next(f) # skip first line
groups = itertools.groupby(f, key=lambda s: s[:4])
for k, g in groups:
print(k, [s.split() for s in g])
This gives you the entries grouped by year, for further processing.
Output for your example data:
2000 [['20000101', '21.00', '223.00'], ['20000102', '20.00', '218.00'], ['20001231', '7.40', '104.00']]
2001 [['20010101', '6.70', '104.00']]
2013 [['20130101', '8.37', '111.63'], ['20131231', '45.00', '120.98']]
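Building on that, a short sketch (not part of the original answer) that uses the same grouping to average the third column (fields[2]) per year, which is what the question asks for:
import itertools

with open("Qdata.txt") as f:
    next(f)  # skip the header line
    for year, rows in itertools.groupby(f, key=lambda s: s[:4]):
        # collect the third column for every complete row in this year's group
        values = [float(fields[2]) for fields in (row.split() for row in rows) if len(fields) > 2]
        if values:
            print("Average in {} was: {:.2f}".format(year, sum(values) / len(values)))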
You could create a dict (or even a defaultdict) for total and count instead:
import sys
from collections import defaultdict
td=open("Qdata.txt","r") # open file Qdata
total=defaultdict(float)
count=defaultdict(int)
row1=True
for row in td :
if (row1) :
row1=False # row1 is for topic
else:
fields=row.split()
try:
year = int(fields[0][:4])
total[year] += float(fields[2])
count[year] += 1
# Errors.
except IndexError:
continue
except ValueError:
print("File is incorrect.")
sys.exit()
print("Average in 2000 was: ",total[2000]/count[2000])
Every year separately? You have to divide your input into groups; something like this might be what you want:
from collections import defaultdict

row1 = True
year_sums = defaultdict(list)
for row in td:
    if row1:
        row1 = False  # skip the header row
        continue
    fields = row.split()
    year = fields[0][:4]
    year_sums[year].append(float(fields[2]))
for year in year_sums:
    average = sum(year_sums[year]) / len(year_sums[year])
    print("Average in {} was: {}".format(year, average))
That is just example code, so I haven't verified it end to end, but it should give you an idea of what you can do. year_sums is a defaultdict containing lists of values grouped by year; you can then use it for other statistics if you want.