I want to get information from two websites and display it on 'real-time' in the console.
To get the information from the website I am using BeautifulSoup 4. I have read, that the bottleneck of scraping websites is the connection or the website itself. So I wanted to use multithreading, so the read can be done 'simultaneously'. Below is my code:
import urllib
from bs4 import BeautifulSoup
import threading
import time
link_website_one = 'http://www.website1.com'
link_website_one = 'http://www.website2.com'
def request_url(url):
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36'
header = { 'User-Agent' : user_agent }
req = urllib.request.Request(url, headers=header)
try:
response = urllib.request.urlopen(req)
return response
except urllib.error.HTTPError as e:
print('The server couldn\'t fulfill the request.')
print(e.code,': ', e.reason)
return None
except urllib.error.URLError as e:
print('We failed to reach a server.')
print(e.code,': ', e.reason)
return None
def new_entry(website, lis, arr, a_id):
if website == 0 :
info1 = arr.find()
info2 = arr.find('div', {'class' : '1'}).find('span', {'class' : '7'}).get_text().strip()
info3 = arr.find('span', {'class' : '2'}).get_text().strip()
info4 = arr.find('span', {'class' : '3'}).get_text().strip()
info5 = arr.find('span', {'class' : '4'}).get_text().strip()
info6 = arr.find('div', {'class' : '5'}).get_text().strip()
lis.append({'info1' : info1, 'info2' : info2, 'info3' : info3, 'info4' : info4, 'info5' : info5, 'info6' : info6})
elif website == 1 :
info1 = a_id
info2 = arr.find('span', {'class' : '8'}).get_text()
info3 = arr.find('div', {'class' : '9'}).get_text()
info4 = arr.find_all('div', {'class' : '10'})[0].get_text()
info5 = arr.find_all('div', {'class' : '10'})[1].get_text()
info6 = arr.a["href"]
lis.append({'info1' : info1, 'info2' : info2, 'info3' : info3, 'info4' : info4, 'info5' : info5, 'info6' : info6})
class AsyncSearch(threading.Thread):
def __init__(self, website, iter_rounds, delay_time):
threading.Thread.__init__(self)
self.website = website
self.iter_rounds = iter_rounds
self.delay_time = delay_time
def run(self):
if self.website == 0:
for z in range(self.iter_rounds):
req_1 = request_url(link_1)
content_1 =req_1.read()
soup_1 = BeautifulSoup(content_1, 'lxml')
arr_1 = soup_1.find_all('div', {'class' : 'special_class'})
for x in range(len(arr_1)):
id_as = int(arr_1[x].find('class123')['class2345'].split(',')[0].split('"')[3].strip())
if id_as not in found_ids_1:
found_ids_1.add(id_as)
new_entry(0, all_entries_1, arr_1[x], id_as)
#else:
# break
req_1.close()
time.sleep(self.delay_time)
elif self.website == 1:
for z in range(self.iter_rounds):
req_2 = request_url(link)
content_2 =req_2.read()
soup_2 = BeautifulSoup(content_2, 'lxml')
arr_2 = soup_2.find_all('div', {'class' : 'class445'})
for x in range(len(arr_2)):
if arr_2[x].a['oid'] not in found_ids_2:
#print('Mobile: ',test[x].a[Inserat_ID])
found_ids_2.add(arr_2[x].a['oid'])
new_entry(1, all_entries_1, arr_2[x], arr_2[x].a['oid'])
#else:
# break
req.close()
time.sleep(self.delay_time)
all_entries_1=[]
all_entries_2=[]
found_ids_1 = set()
found_ids_2 = set()
website1 = AsyncSearch(0,10,1)
website2 = AsyncSearch(1,10,1)
website1.start()
website2.start()
website1.join()
website2.join()
First of two lists (all_entries 1/2) and two sets (found_ids 1/2) are created.
The websites I am scraping are offering 20 ads per page with unique ids. With the new_entry method you can say which website you want, in which list it should append the new entry, which array contains the beautifulsoup and the id of the ad you want to append.
Now for the mutlithreading I created a class AsyncSearch, where you can choose the website, choose the number of iterations of requests and how long to wait for the next request.
The two sets found_ids 1/2 are there so you don't append an ad more than once in the all_entries list.
Now to the problem. This code works.
But: If delay_time = 1 and iterations = 10 it needs 20 seconds to finish. Is there a faster approach to solve this problem? the .read() duration is between 0.12 and 0.17 seconds for website 1 and between 0.03 and 0.10 seconds for website 2.
If I said anything not understandable please ask.
Related
I am trying to extract data from this webpage :
https://www.oddsportal.com/basketball/usa/nba-2008-2009/results/
My code works but it do not extract data which match with what we can see on the website.
For odds you can see for the first event :
Orlando Magic - Los Angeles Lakers
The result is ok but for odds 1 and 2 there are differences.
Here is my code :
#dataframe that will be populated
df = pd.DataFrame()
# loading options for the driver
driver = webdriver.Chrome(options=options)
#beautiful soup and selenium objects
options = Options()
# maximized UI to load totally the page
options.add_argument("start-maximized")
url = 'https://www.oddsportal.com/basketball/usa/nba-2008-2009/results/'
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
for col in soup.find_all('tr', attrs = {'deactivate'}):
df = df.append(
{
#match date
'date' : col.findPreviousSibling(attrs = {'center nob-border'}).text[0:-6],
#match name
'match_name' : col.find('td', attrs = {'class' : 'name table-participant'}).text.replace('\xa0', ''),
#match result
'result' : col.find('td', attrs = {'class' : 'center bold table-odds table-score'}).text,
#home winning odd
'home_odd' : is_empty(col.find('td', attrs = {'class' : "odds-nowrp"})),
#away odd
'away_odd' : is_empty(col.find('td', attrs = {'class' : "odds-nowrp"}).findNext( attrs = {'class' : "odds-nowrp"}))
},
ignore_index = True)
driver.quit()
df.head()
Good day, everyone.
I'm trying to get the table on each page from the links appended to 'player_page.'
I want the stats per game for each player in that season, and the table I want is listed on the players' individual page. Each link appended is correct, but I'm having trouble capturing the correct info when running my loops.
Any idea what I'm doing wrong here?
Any help is appreciated.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from numpy import sin
url = 'https://www.pro-football-reference.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}
year = 2018
r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
soup = BeautifulSoup(r.content, 'lxml')
player_list = soup.find_all('td', attrs= {'class': 'left', 'data-stat': 'player'})
player_page = []
for player in player_list:
for link in player.find_all('a', href= True):
#names = str(link['href'])strip('')
link = str(link['href'].strip('.htm'))
player_page.append(url + link + '/gamelog' + '/' + str(year))
for page in player_page:
dfs = pd.read_html(page)
yearly_stats = []
for df in dfs:
yearly_stats.append(df)
final_stats = pd.concat(yearly_stats)
final_stats.to_excel('Fantasy2018.xlsx')
This works. The table columns change according to the player's position, I believe. Not everyone has tackle information, for example.
import pandas as pd
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'https://www.pro-football-reference.com'
year = 2018
r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
soup = BeautifulSoup(r.content, 'lxml')
player_list = soup.find_all('td', attrs= {'class': 'left', 'data-stat': 'player'})
dfs = []
for player in player_list:
for link in player.find_all('a', href= True):
name = link.getText()
link = str(link['href'].strip('.htm'))
try:
df = pd.read_html(url + link + '/gamelog' + '/' + str(year))[0]
for i, columns_old in enumerate(df.columns.levels):
columns_new = np.where(columns_old.str.contains('Unnamed'), '' , columns_old)
df.rename(columns=dict(zip(columns_old, columns_new)), level=i, inplace=True)
df.columns = df.columns.map('|'.join).str.strip('|')
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Date'])
df.insert(0,'Name',name)
df.insert(1,'Moment','Regular Season')
dfs.append(df)
except:
pass
try:
df1 = pd.read_html(url + link + '/gamelog' + '/' + str(year))[1]
for i, columns_old in enumerate(df1.columns.levels):
columns_new = np.where(columns_old.str.contains('Unnamed'), '' , columns_old)
df1.rename(columns=dict(zip(columns_old, columns_new)), level=i, inplace=True)
df1.columns = df1.columns.map('|'.join).str.strip('|')
df1['Date'] = pd.to_datetime(df1['Date'], errors='coerce')
df1 = df1.dropna(subset=['Date'])
df1.insert(0,'Name',name)
df1.insert(1,'Moment','Playoffs')
dfs.append(df1)
except:
pass
dfall = pd.concat(dfs)
dfall.to_excel('Fantasy2018.xlsx')
I want to get links of names from all the pages by clicking load more and needs help with pagination
I've got the logic to print links for names but needs help with pagination
for pos in positions:
url = "https://247sports.com/Season/2021-Football/CompositeRecruitRankings/?InstitutionGroup=HighSchool"
two = requests.get("https://247sports.com/Season/2021-Football/CompositeRecruitRankings/?InstitutionGroup=HighSchool" + pos,headers=HEADERS)
bsObj = BeautifulSoup(two.content , 'lxml')
main_content = urljoin(url,bsObj.select(".data-js")[1]['href']) ## ['href']InstitutionGroup" extracting the link leading to the page containing everything available here
response = requests.get(main_content)
obj = BeautifulSoup(response.content , 'lxml')
names = obj.findAll("div",{"class" : "recruit"})
for player_name in names:
player_name.find('a',{'class' : ' rankings-page__name-link'})
for all_players in player_name.find_all('a', href=True):
player_urls = site + all_players.get('href')
# print(player_urls)
I expect output : https://247sports.com/Player/Jack-Sawyer-46049925/
(links of all player names)
Can just iterate through the parameters in the requests. Since you can just continue to iterate forever, I had it check for when players started to repeat (essentially when the next iteration doesn't add new players). Seem to stop after 21 pages which gives 960 players.
import requests
from bs4 import BeautifulSoup
url = 'https://247sports.com/Season/2021-Football/CompositeRecruitRankings/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
player_links = []
prior_count = 0
for page in range(1,101):
#print ('Page: %s' %page)
payload = {
'ViewPath': '~/Views/SkyNet/PlayerSportRanking/_SimpleSetForSeason.ascx',
'InstitutionGroup': 'HighSchool',
'Page': '%s' %page}
response = requests.get(url, headers=headers, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')
recruits = soup.find_all('div',{'class':'recruit'})
for recruit in recruits:
print ('https://247sports.com' + recruit.find('a')['href'])
player_links.append('https://247sports.com' + recruit.find('a')['href'])
current_count = len(list(set(player_links)))
if prior_count == current_count:
print ('No more players')
break
else:
prior_count = current_count
Output:
print (player_links)
['https://247sports.com/Player/Korey-Foreman-46056100', 'https://247sports.com/Player/Jack-Sawyer-46049925', 'https://247sports.com/Player/Tommy-Brockermeyer-46040211', 'https://247sports.com/Player/James-Williams-46049981', 'https://247sports.com/Player/Payton-Page-46055295', 'https://247sports.com/Player/Camar-Wheaton-46050152', 'https://247sports.com/Player/Brock-Vandagriff-46050870', 'https://247sports.com/Player/JT-Tuimoloau-46048440', 'https://247sports.com/Player/Emeka-Egbuka-46048438', 'https://247sports.com/Player/Tony-Grimes-46048912', 'https://247sports.com/Player/Sam-Huard-46048437', 'https://247sports.com/Player/Amarius-Mims-46079928', 'https://247sports.com/Player/Savion-Byrd-46078964', 'https://247sports.com/Player/Jake-Garcia-46053996', 'https://247sports.com/Player/Agiye-Hall-46055274', 'https://247sports.com/Player/Caleb-Williams-46040610', 'https://247sports.com/Player/JJ-McCarthy-46042742', 'https://247sports.com/Player/Dylan-Brooks-46079585', 'https://247sports.com/Player/Nolan-Rucci-46058902', 'https://247sports.com/Player/GaQuincy-McKinstry-46052990', 'https://247sports.com/Player/Will-Shipley-46056925', 'https://247sports.com/Player/Maason-Smith-46057128', 'https://247sports.com/Player/Isaiah-Johnson-46050757', 'https://247sports.com/Player/Landon-Jackson-46049327', 'https://247sports.com/Player/Tunmise-Adeleye-46050288', 'https://247sports.com/Player/Terrence-Lewis-46058521', 'https://247sports.com/Player/Lee-Hunter-46058922', 'https://247sports.com/Player/Raesjon-Davis-46056065', 'https://247sports.com/Player/Kyle-McCord-46047962', 'https://247sports.com/Player/Beaux-Collins-46049126', 'https://247sports.com/Player/Landon-Tengwall-46048781', 'https://247sports.com/Player/Smael-Mondon-46058273', 'https://247sports.com/Player/Derrick-Davis-Jr-46049676', 'https://247sports.com/Player/Troy-Franklin-46048840', 'https://247sports.com/Player/Tywone-Malone-46081337', 'https://247sports.com/Player/Micah-Morris-46051663', 'https://247sports.com/Player/Donte-Thornton-46056489', 'https://247sports.com/Player/Bryce-Langston-46050326', 'https://247sports.com/Player/Damon-Payne-46041148', 'https://247sports.com/Player/Rocco-Spindler-46049869', 'https://247sports.com/Player/David-Daniel-46076804', 'https://247sports.com/Player/Branden-Jennings-46049721', 'https://247sports.com/Player/JaTavion-Sanders-46058800', 'https://247sports.com/Player/Chris-Hilton-46055801', 'https://247sports.com/Player/Jason-Marshall-46051367', ... ]
I am a beginner and answers on this forum have been invaluable. I am using Python 3 and Beautiful Soup to scrape (non-table) data from multiple web pages on the same website by looping the page number. It works but I keep getting the AttributeError: 'NoneType' object has no attribute 'text' after the first iteration.
Here is the code I have tried thus far:
import requests
from bs4 import BeautifulSoup
import csv
import lxml
# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []
# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)
pages = [str(i) for i in range(1,8)]
for page in pages:
response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
html_soup = BeautifulSoup(response, 'lxml')
# Extract data from individual listing containers
listing_containers = html_soup.find_all('div', class_='_3PdAH')
print(type(listing_containers))
print(len(listing_containers))
for container in listing_containers:
address = container.a.text
addresses.append(address)
geography = container.find('div', class_='_1dhrl').text
geographies.append(geography)
rent = container.find('div', class_='_3e12V').text
rents.append(rent)
unit = container.find('div', class_='_2tApa').text
units.append(unit)
availability = container.find('div', class_='_2P6xE').text
availabilities.append(availability)
import pandas as pd
test_df = pd.DataFrame({'Street' : addresses,
'City-State-Zip' : geographies,
'Rent' : rents,
'BR/BA' : units,
'Units Available' : availabilities
})
print(test_df)
Here is the output:
240 Properties
<class 'bs4.element.ResultSet'>
30
Street City-State-Zip Rent BR/BA Units Available
0 Quarry Place at Tuckahoe 64 Midland PlaceTuckahoe, NY 10707 $2,490+ 1–2 Beds • 1–2 Baths 2 Units Available
Traceback (most recent call last):
File "renttucktabletest.py", line 60, in <module>
availability = container.find('div', class_='_2P6xE').text
AttributeError: 'NoneType' object has no attribute 'text'
The result I am looking for is all 240 listings in the pandas dataframe exactly like the first iteration shown in the output above. Can anyone help to fix this error? Would be much appreciated. Thank you!
As pointed out, the issue is some of the containers are missing certain div elements. eg no 'unit' or 'availability' information.
One way to deal with this would be to use if - else statements. Append only if the element exists, else append a NaN value. Something like:
import requests
import numpy as np
from bs4 import BeautifulSoup
import csv
import lxml
# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []
# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)
pages = [str(i) for i in range(1,8)]
for page in pages:
response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
html_soup = BeautifulSoup(response, 'lxml')
# Extract data from individual listing containers
listing_containers = html_soup.find_all('div', class_='_3PdAH')
print(type(listing_containers))
print(len(listing_containers))
for container in listing_containers:
address = container.a
if address:
addresses.append(address.text)
else:
addresses.append(np.nan)
geography = container.find('div', class_='_1dhrl')
if geography:
geographies.append(geography.text)
else:
geographies.append(np.nan)
rent = container.find('div', class_='_3e12V')
if rent:
rents.append(rent.text)
else:
rents.append(np.nan)
unit = container.find('div', class_='_2tApa')
if unit:
units.append(unit.text)
else:
units.append(np.nan)
availability = container.find('div', class_='_2P6xE')
if availability:
availabilities.append(availability.text)
else:
availabilities.append(np.nan)
import pandas as pd
test_df = pd.DataFrame({'Street' : addresses,
'City-State-Zip' : geographies,
'Rent' : rents,
'BR/BA' : units,
'Units Available' : availabilities
})
print(test_df)
Street City-State-Zip Rent \
0 Quarry Place at Tuckahoe 64 Midland PlaceTuckahoe, NY 10707 $2,490+
1 address not disclosed Tuckahoe, NY 10707 $2,510
2 address not disclosed Tuckahoe, NY 10707 $4,145
3 60 Washington St 1 60 Washington StTuckahoe, NY 10707 $3,500
4 269 Columbus Ave 5 269 Columbus AveTuckahoe, NY 10707 $2,700
BR/BA Units Available
0 1–2 Beds • 1–2 Baths 2 Units Available
1 1 Bed • 1 Bath NaN
2 2 Beds • 2 Bath NaN
3 3 Beds • 2 Bath NaN
4 2 Beds • 1 Bath NaN
If you pull the info from a script tag and treat as json that problem goes away. None or 0 is returned from the json where had you been trying for class name etc you would have got an error.
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
def add_records(url, s):
res = requests.get(url)
soup = bs(res.content, 'lxml')
r = re.compile(r'window.__APPLICATION_CONTEXT__ = (.*)')
data = soup.find('script', text=r).text
script = r.findall(data)[0]
items = json.loads(script)['store']['listings']['listings']
for item in items:
street = item['address']
geography = ', '.join([item['city'], item['state'], item['zipCode']])
rent = item['aggregates']['prices']['low']
BR_BA = 'beds: ' + str(item['aggregates']['beds']['low']) + ' , ' + 'baths: ' + str(item['aggregates']['baths']['low'])
units = item['aggregates']['totalAvailable']
listingId = item['listingId']
url = base_url + item['listingSeoPath']
# all_info = item
record = {'Street' : street,
'Geography' : geography,
'Rent' : rent,
'BR/BA' : BR_BA,
'Units Available' : units,
'ListingId' : listingId,
'Url' : url}
results.append(record)
url = 'https://www.rent.com/new-york/tuckahoe-apartments?page={}'
base_url = 'https://www.rent.com/'
results = []
with requests.Session() as s:
for page in range(1, 9):
add_records(url.format(page), s)
df = pd.DataFrame(results, columns = [ 'Street', 'Geography', 'Rent', 'BR/BA', 'Units Available', 'ListingId', 'Url'])
print(df)
Here is another approach to achieve the same.
import pandas
import requests
from bs4 import BeautifulSoup
urls = ['https://www.rent.com/new-york/tuckahoe-apartments?page={}'.format(page) for page in range(1,9)]
def get_content(links):
for url in links:
res = requests.get(url)
soup = BeautifulSoup(res.text, 'lxml')
for items in soup.select("._3PdAH"):
d = {}
d['address'] = items.select_one("[data-tid='property-title']").text
try:
d['geographies'] = items.select_one("[data-tid='listing-info-address']").text
except AttributeError: d['geographies'] = ""
try:
d['rent'] = items.select_one("[data-tid='price']").text
except AttributeError: d['rent'] = ""
try:
d['units'] = items.select_one("[data-tid='beds-baths']").text
except AttributeError: d['units'] = ""
try:
d['availabilities'] = items.select_one("[data-tid='property-unitAvailText']").text
except AttributeError: d['availabilities'] = ""
dataframe.append(d)
return dataframe
if __name__ == '__main__':
dataframe = []
item = get_content(urls)
df = pandas.DataFrame(item)
df.to_csv("output.csv",index=False)
The below python program asks the user for two reddit usernames and compares their score.
import json
from urllib import request
def obtainKarma(users_data):
users_info = []
for user_data in users_data:
data = json.load(user_data)
posts = data["data"]["children"]
num_posts = len(posts)
scores = []
comments = []
for post_id in range(num_posts):
score = posts[post_id]["data"]["score"]
comment = posts[post_id]["num_comments"]
scores.append(score)
comments.append(comment)
users_info.append((scores,comments))
user_id = 0
for user_info in users_info:
user_id+=1
print("User"+str(user_id))
for user_attr in user_info:
print(user_attr)
def getUserInfo():
count = 2
users_data = []
while count:
count = count + 1
username = input("Please enter username:\n")
url = "https://reddit.com/user/"+username+".json"
try:
user_data = request.urlopen(url)
except:
print("No such user.\nRetry Please.\n")
count = count + 1
raise
users_data.append(user_data)
obtainKarma(users_data)
if __name__ == '__main__':
getUserInfo()
However, when I run the program and enter a username, I get an error:
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 429: Too Many Requests
I tried looking for similar issues but none of them satisfied to solve this specific issue. Looking at the error, it would make sense to say that the URL includes an amount of data that exceeds a specific limit? But that still sounds absurd because it is not that much of a data.
Thanks.
The problem seems to be resolved when you supply a User-Agent with your request.
import json
from urllib import request
def obtainKarma(users_data):
users_info = []
for user_data in users_data:
data = json.loads(user_data) # I've changed 'json.load' to 'json.loads' because you want to parse a string, not a file
posts = data["data"]["children"]
num_posts = len(posts)
scores = []
comments = []
for post_id in range(num_posts):
score = posts[post_id]["data"]["score"]
comment = posts[post_id]["data"]["num_comments"] # I think you forgot '["data"]' here, so I added it
scores.append(score)
comments.append(comment)
users_info.append((scores,comments))
user_id = 0
for user_info in users_info:
user_id+=1
print("User"+str(user_id))
for user_attr in user_info:
print(user_attr)
def getUserInfo():
count = 2
users_data = []
while count:
count = count + 1
username = input("Please enter username:\n")
url = "https://reddit.com/user/"+username+".json"
user_data = None
try:
req = request.Request(url)
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)')
resp = request.urlopen(req)
user_data = resp.read().decode("utf-8")
except Exception as e:
print(e)
print("No such user.\nRetry Please.\n")
count = count + 1
raise # why raise? --> Program will end if user is not found
if user_data:
print(user_data)
users_data.append(user_data)
obtainKarma(users_data)
if __name__ == '__main__':
getUserInfo()
There were still other issues with your code:
You should not write json.load(user_data), because you are parsing a string. So I changed it to use json.loads(user_data).
The Python documentation for json.loads states:
Deserialize s (a str instance containing a JSON document) to a Python object using this conversion table.
And in the code comment = posts[post_id]["num_comments"], I think you forgot to index on 'data', so I changed it to comment = posts[post_id]["data"]["num_comments"]
And why are you raising the exception in the except-block? This will end the program, however it seems that you expect it not to, from looking at the following code:
print("No such user.\nRetry Please.\n")