Getting table values from nowgoal raises an IndexError - python-3.x

I am quite new to scraping. I am getting match links from nowgoal. Below is how I start navigating to the page. I do not wish to get links for all matches; instead I have an input text file (attached here) from which I use the selected leagues and date.
The following code initialises the inputs:
# imports used by the snippets below
from configparser import RawConfigParser
from selenium import webdriver
import bs4
import time

#Initialisation
league_index = []
final_list = []
j = 0

#config load
config = RawConfigParser()
configFilePath = r'.\config.txt'
config.read(configFilePath)
date = config.get('database_config', 'date')                # provided by the user in YYYY-MM-DD format
leagues = config.get('database_config', 'leagues')          # provided by the user - comma-separated league names
headless_param = config.get('database_config', 'headless')  # set True to run Chrome without a visible browser window
leagues_list = leagues.split(',')
print(leagues_list)
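For reference, a config.txt along these lines (the values below are placeholders, not taken from the original post) would satisfy the config.get calls above:

[database_config]
date = 2021-01-29
leagues = English Premier League,Italian Serie A,England Championship
headless = True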
After initialising the preferred date and leagues, I set up the Chrome driver as follows:
options = webdriver.ChromeOptions()  # initialise webdriver options
#options.binary_location = brave_path  # if you are running the script on Brave, enable this
if headless_param == 'True':
    print('headless')
    options.headless = True  # if the headless parameter is set to True, the Chrome browser will not appear in the foreground
options.add_argument('start-maximized')  # start Chrome maximised
options.add_argument('disable-infobars')  # disable infobars
options.add_experimental_option("excludeSwitches", ["enable-automation"])
# note: successive "prefs" calls overwrite each other, so both preferences go in one dict
options.add_experimental_option("prefs", {"profile.default_content_setting_values.cookies": 2,
                                          "profile.block_third_party_cookies": True})
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--incognito")  # incognito mode
# initiate the driver
driver = webdriver.Chrome(resource_path('./drivers/chromedriver.exe'), options=options)
# format the url
url = 'http://www.nowgoal3.com/football/fixture/?f=ft0&date=' + date
# get the url
driver.get(url)
# wait for some time
time.sleep(3)
driver.find_element_by_xpath('//*[@id="li_league"]').click()
time.sleep(5)
# click on the team-ranking checkbox
driver.find_element_by_xpath('//*[@id="TeamOrderCheck"]').click()
After this, you are brought to the fixtures page for that date; I have also added a snapshot below.
I then try to get the data from the table by looping over its rows; the code is as follows:
#Get the leagues name from page
htmlSource = driver.page_source
#Pass the html source into soup
soup = bs4.BeautifulSoup(htmlSource, 'html.parser')
#Table
table = soup.select('table[id="table_live"]')
#Rows of table
all_rows = table[0].select('tr')
#loop through each row
for i, row in enumerate(all_rows[2:]):
    try:
        key_word = row['class'][0]
        print(key_word)
        if 'Leaguestitle' in key_word:  # if the league changed
            league = row.a.text
            print(row.a.text)
            if row.a.text in leagues_list:
                j = 1
            else:
                j = 0
        elif j == 1:
            home_team = row.findAll('a')[0].text  # home team
            print(home_team)
            away_team = row.findAll('a')[1].text  # away team
            match_number = ''.join(filter(str.isdigit, row.findAll('a')[2]['href'].strip()))
            # link for 3-in-1 odds from the match code
            link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'
            home_ranking = row.findAll('span')[0].text.strip('[]')  # home team ranking
            away_ranking = row.findAll('span')[1].text.strip('[]')  # away team ranking
            final_list.append([home_team, home_ranking, away_team, away_ranking, league, match_number, link])
    except KeyError:
        try:
            if row['style'] == 'display:none':
                continue
            elif j == 1:
                home_team = row.findAll('a')[0].text  # home team
                away_team = row.findAll('a')[1].text  # away team
                home_ranking = row.findAll('span')[0].text.strip('[]')  # home team ranking
                away_ranking = row.findAll('span')[1].text.strip('[]')  # away team ranking
                match_number = ''.join(filter(str.isdigit, row.findAll('a')[2]['href'].strip()))
                # link for 3-in-1 odds from the match code
                link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'
                final_list.append([home_team, home_ranking, away_team, away_ranking, league, match_number, link])
        except KeyError:
            print('KeyError')
    except IndexError:
        if j == 1:
            home_team = row.findAll('a')[0].text  # home team
            away_team = row.findAll('a')[1].text  # away team
            home_ranking = row.findAll('span')[0].text.strip('[]')  # home team ranking
            away_ranking = row.findAll('span')[1].text.strip('[]')  # away team ranking
            match_number = ''.join(filter(str.isdigit, row.findAll('a')[2]['href'].strip()))
            # link for 3-in-1 odds from the match code
            link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'
            final_list.append([home_team, home_ranking, away_team, away_ranking, league, match_number, link])
        print('IndexError-captured')

print(final_list)  # show the final result
driver.quit()  # close the browser
Then I print out the home team and get the following output:
Chelsea adtext-bg QC: MAY88.COM - NHÀ CÁI HỢP PHÁP NA UY - THƯỞNG NẠP
100% - HOÀN TRẢ 100TR - HỖ TRỢ 24/7
Then it threw an index error, as follows:
Traceback (most recent call last):
File "D:/Football matters/Sttratagem data access/Games By Numbers/Nowgoal scraping project/codes/NOWGOAL-20200721T024808Z-001/NOWGOAL/PYFILES/Link_extractor_v1.3.py", line 124, in <module>
away_team = row.findAll('a')[1].text #away team
IndexError: list index out of range
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/Football matters/Sttratagem data access/Games By Numbers/Nowgoal scraping project/codes/NOWGOAL-20200721T024808Z-001/NOWGOAL/PYFILES/Link_extractor_v1.3.py", line 149, in <module>
away_team = row.findAll('a')[1].text #away team
IndexError: list index out of range

# assumes the usual Selenium wait setup, e.g.:
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# wait = WebDriverWait(driver, 20)
league_list = ["English Premier League", 'Italian Serie A', 'England Championship', 'Spanish La Liga',
               'Swedish Allsvenskan', 'USA Major League Soccer', 'Saudi', 'Dutch Cup']
# wait for some time
wait.until(EC.element_to_be_clickable((By.ID, "li_league"))).click()
# click on the team-ranking checkbox
wait.until(EC.element_to_be_clickable(
    (By.XPATH, "//label[@for='TeamOrderCheck']/span"))).click()
for league in league_list:
    try:
        nextRow = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//tr[.//a[contains(text(),"{}")]]'.format(league))))
        id = nextRow.get_attribute("id").split("_")[1]
        try:
            row = wait.until(EC.presence_of_all_elements_located(
                (By.XPATH, '//tr[preceding-sibling::tr[.//a[contains(text(),"{}")]] and following-sibling::tr[@id="tr_{}"] and not(@style="display:none")]'.format(league, int(id) + 1))))
            print("########The result for {} ########".format(league))
            for i in row:
                print(i.get_attribute("textContent"))
            print("###########Completed##############")
        except:
            row = wait.until(EC.presence_of_all_elements_located(
                (By.XPATH, '//tr[preceding-sibling::tr[.//a[contains(text(),"{}")]] and not(@style="display:none")]'.format(league))))
            print("########The result for {} ########".format(league))
            for i in row:
                print(i.get_attribute("textContent"))
            print("###########Completed##############")
            continue
    except:
        continue
You can use the following-sibling and preceding-sibling properties; as there is no unique way to identify the next league header row, we take the current row's id and increment it by 1.
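Separately, the 'adtext-bg ... MAY88.COM' line in the output above suggests that the rows breaking the original BeautifulSoup loop are advertisement rows that do not contain a second <a> tag. A small guard along these lines (a sketch against the original loop, not part of the Selenium approach above) would let that loop skip such rows before indexing:

anchors = row.findAll('a')
spans = row.findAll('span')
# skip ad rows and any row that lacks the expected links/rankings
if 'adtext' in ' '.join(row.get('class', [])) or len(anchors) < 3 or len(spans) < 2:
    continue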

This prints all the information of each row:
wait = WebDriverWait(driver, 5)
driver.get('http://www.nowgoal3.com/football/fixture/?type=&f=sc1&date=2021-01-29')
league_list = ["English Premier League", 'Italian Serie A', 'England Championship', 'Spanish La Liga',
               'Swedish Allsvenskan', 'USA Major League Soccer', 'Swiss Challenge League']
# wait for some time
wait.until(EC.element_to_be_clickable((By.ID, "li_league"))).click()
# click on the team-ranking checkbox
wait.until(EC.element_to_be_clickable((By.XPATH, "//label[@for='TeamOrderCheck']/span"))).click()
for league in league_list:
    try:
        header = driver.find_element_by_xpath("//tr[@class='Leaguestitle fbHead']/td[2]/span/a[text()='" + league + "']")
        #print(len(header))
        print(header.text)
        nextRow = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//tr[.//a[contains(text(),"{}")]]'.format(league))))
        id = nextRow.get_attribute("id").split("_")[1]
        try:
            rows = wait.until(EC.presence_of_all_elements_located(
                (By.XPATH, '//tr[preceding-sibling::tr[.//a[contains(text(),"{}")]] and following-sibling::tr[@id="tr_{}"] and not(@style="display:none")]'.format(league, int(id) + 1))))
        except:
            rows = wait.until(EC.presence_of_all_elements_located(
                (By.XPATH, '//tr[preceding-sibling::tr[.//a[contains(text(),"{}")]] and not(@style="display:none")]'.format(league))))
            continue
        #print(len(rows))
        for row in rows:
            home = row.find_element_by_css_selector("td:nth-child(5) > a").text
            homeRank = row.find_element_by_css_selector("td:nth-child(5) span.team-hg").text.strip('[]')
            away = row.find_element_by_css_selector("td:nth-child(7) > a").text
            awayRank = row.find_element_by_css_selector("td:nth-child(7) span.team-hg").text.strip('[]')
            link = row.find_element_by_css_selector("td.toolimg > a:nth-child(3)").get_attribute('href')
            link = ''.join(filter(lambda i: i.isdigit(), link))
            link = 'http://data.nowgoal.group/3in1odds/' + link + '.html'
            print(home, homeRank, away, awayRank, link)
    except:
        continue
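If you also want the same final_list structure as in the question, the print at the end of the row loop can be replaced with an append; a sketch using the variables built above (league comes from the outer loop):

# inside the for-row loop, after link has been built
final_list.append([home, homeRank, away, awayRank, league, link])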

Related

How to ignore a key error and continue the while loop

I am trying to create a dataframe with Python's pandas library using data obtained from a requests response. The problem is that when an item is not available on the API, it raises a KeyError and crashes the program.
The source dataframe is iterated over by product name. For each row it takes the product name, finds how many different SKUs exist, creates a row in a new dataframe for each SKU, and adds some quantities and other needed information to the new dataframe. The idea is to have the row from the first dataframe repeated once per SKU, updated with the quantity and package ID for that SKU.
If the length of the response returned is 0, I still want it to append the row from the first dataframe.
def create_additional_rows_needed(comb_data):
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logging.debug("test")

    new_combined_data = pd.DataFrame(columns=comb_data.columns)
    COVA_DATA_LEN = 2993
    row = 0
    current_item = ''
    while row < len(comb_data):
        number_of_skus = 0
        current_item = comb_data.iloc[row, 1]
        if (len(current_item)) is not None:
            number_of_skus = len(find_gb_product(current_item))
        else:
            number_of_skus = 0
        current_quantity = find_gb_product(current_item).iloc[number_of_skus - 1, find_gb_product(current_item).columns.get_loc('quantity')]
        logger.info('Current Quantity: {}'.format(current_quantity))
        current_package = find_gb_product(current_item)['lot_number'][number_of_skus - 1]
        if number_of_skus == 0:
            pass
        while number_of_skus > 0:
            logger.info('Current Item: {}'.format(current_item))
            logger.info('Number of Skus: {}'.format(number_of_skus))
            logger.info('Appending: {}'.format(comb_data.iloc[row, 1]))
            new_combined_data = new_combined_data.append([comb_data.iloc[row, :]])
            new_combined_data.iloc[-1, new_combined_data.columns.get_loc('TotalOnHand')] = current_quantity
            new_combined_data.iloc[-1, new_combined_data.columns.get_loc('PackageId')] = current_package
            number_of_skus = number_of_skus - 1
        logger.info('Finished index {}'.format(row))
        row = row + 1
        logger.info('Moving to index {}'.format(row))
    return new_combined_data
It goes well for every item with the exception of a few. Here is the error I get.
KeyError
2889 return self._engine.get_loc(casted_key)
2890 except KeyError as err:
-> 2891 raise KeyError(key) from err
2892
2893 if tolerance is not None:
KeyError: 'quantity'
This has taken up my entire weekend and all my sleep and is due Monday Morning at 10am MST with only two days notice. Please help me.
Catching the error and continuing should work. Something along the lines of:
while row < len(comb_data):
    ....
    try:
        current_quantity = find_gb_product(current_item).iloc[number_of_skus - 1, find_gb_product(current_item).columns.get_loc('quantity')]
    except KeyError:
        continue
    ....
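One caveat (an assumption about the elided parts of the loop above): if row is only incremented at the bottom of the while loop, a bare continue will retry the same index forever. It is safer to advance the counter before continuing, along these lines:

    try:
        current_quantity = find_gb_product(current_item).iloc[
            number_of_skus - 1,
            find_gb_product(current_item).columns.get_loc('quantity')]
    except KeyError:
        # 'quantity' is missing for this item; skip it but still move on to the next row
        row = row + 1
        continue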

Python Program ValueError: invalid literal for int() with base 10:

I am a Python beginner, trying to write a program for "Corona Virus Live Updates for India – Using Python".
I am getting this error after running the program:
performance.append(int(row[2]) + int(row[3]))
ValueError: invalid literal for int() with base 10:
What can I do to fix this problem?
The Code:
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
URL = 'https://www.mohfw.gov.in/'
SHORT_HEADERS = ['SNo', 'State', 'Indian-Confirmed',
                 'Foreign-Confirmed', 'Cured', 'Death']
response = requests.get(URL).content
soup = BeautifulSoup(response, 'html.parser')
header = extract_contents(soup.tr.find_all('th'))
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    if stat:
        if len(stat) == 5:
            # last row
            stat = ['', *stat]
            stats.append(stat)
        elif len(stat) == 6:
            stats.append(stat)
stats[-1][1] = "Total Cases"
stats.remove(stats[-1])

#Step #3:
objects = []
for row in stats:
    objects.append(row[1])
y_pos = np.arange(len(objects))
performance = []
for row in stats:
    performance.append(int(row[2]) + int(row[3]))
table = tabulate(stats, headers=SHORT_HEADERS)
print(table)
I just changed the line performance.append(int(row[2]) + int(row[3])) to performance.append(row[2] + str(int(float(row[3])))).
Full Code:
import requests
from bs4 import BeautifulSoup
import numpy as np
from tabulate import tabulate

extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
URL = 'https://www.mohfw.gov.in/'
SHORT_HEADERS = ['SNo', 'State', 'Indian-Confirmed', 'Foreign-Confirmed', 'Cured', 'Death']
response = requests.get(URL).content
soup = BeautifulSoup(response, 'html.parser')
header = extract_contents(soup.tr.find_all('th'))
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    if stat:
        if len(stat) == 5:
            # last row
            stat = ['', *stat]
            stats.append(stat)
        elif len(stat) == 6:
            stats.append(stat)
stats[-1][1] = "Total Cases"
stats.remove(stats[-1])

#Step #3:
objects = [row[1] for row in stats]
y_pos = np.arange(len(objects))
performance = [row[2] + str(int(float(row[3]))) for row in stats]
table = tabulate(stats, headers=SHORT_HEADERS)
print(table)
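The ValueError usually means one of the scraped cells is empty or contains non-numeric text, so a defensive conversion may be more robust than concatenating strings; a small sketch with a hypothetical helper (not part of the original answer):

def to_int(value):
    # treat blanks and non-numeric text as 0 instead of raising ValueError
    value = value.strip()
    try:
        return int(float(value)) if value else 0
    except ValueError:
        return 0

performance = [to_int(row[2]) + to_int(row[3]) for row in stats]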

Taking info from file and creating a dictionary

My goal is to create a dictionary called 'sum_by_department' that contains the department as the key and the total annual salary of all its employees combined as the value. So far this is what I have, but I'm a bit lost on how to add all the department names, along with the sum of all of the employees' salaries, to that dictionary. The current dictionary I tried displays only the salary amount and how many times it is seen in the data; this is where I need the help.
import requests

# endpoint
endpoint = "https://data.cityofchicago.org/resource/xzkq-xp2w.json"
# optional parameters
parameters = {"$limit": 20,}
# make request
response = requests.get(endpoint, params=parameters)
# Get the response data as a python object.
data = response.json()

count_by_department = {}
sum_by_department = {}
# loop through the data
for i in data:
    if ('department' and 'salary_or_hourly' and 'annual_salary' in i):
        department = i['department']
        pay_type = i['salary_or_hourly']
        anual_salary = i['annual_salary']
        # print(i['annual_salary'])
    else:
        # handle case where there is no department property in that record
        department = 'undefined'
        pay_type = 'n/a'
        anual_salary = 'n/a'
    # print(department, ",", pay_type)
    # exclude the cases where the pay type is Hourly
    if (pay_type != 'Salary'):
        pay_type = 0
    # print(department, ",", pay_type)
    # update the sum_by_department and count_by_department dictionaries
    if (department in count_by_department):
        count_by_department[department] += 1
    else:
        count_by_department[department] = 1
    if (anual_salary in sum_by_department):
        sum_by_department[anual_salary] += 1
    else:
        sum_by_department[anual_salary] = 1
# print(count_by_department)
# print(sum_by_department)
You should add each person's annual_salary to the sum_by_department dictionary while looping. Also, do not forget to convert your annual_salary variable to the float type, because adding them together as strings won't work.
Example script:
import requests

# endpoint
endpoint = "https://data.cityofchicago.org/resource/xzkq-xp2w.json"
# optional parameters
parameters = {"$limit": 20,}
# make request
response = requests.get(endpoint, params=parameters)
# Get the response data as a python object.
data = response.json()

count_by_department = {}
sum_by_department = {}
# loop through the data
for i in data:
    if ('department' and 'salary_or_hourly' and 'annual_salary' in i):
        department = i['department']
        pay_type = i['salary_or_hourly']
        annual_salary = float(i['annual_salary'])
        # print(i['annual_salary'])
    else:
        # handle case where there is no department property in that record
        department = 'undefined'
        pay_type = 'n/a'
        annual_salary = 0
    # print(department, ",", pay_type)
    # exclude the cases where the pay type is Hourly
    if (pay_type != 'Salary'):
        pay_type = 0
    # print(department, ",", pay_type)
    # update the sum_by_department and count_by_department dictionaries
    if (department in count_by_department):
        count_by_department[department] += 1
        sum_by_department[department] += annual_salary
    else:
        count_by_department[department] = 1
        sum_by_department[department] = annual_salary
#import pdb; pdb.set_trace();
print('count_by_department = ', count_by_department)
print('sum_by_department = ', sum_by_department)
Tip:
Uncomment the pdb line to debug interactively. The Python Debugger (pdb for short) halts the program while it's still running (i.e. in memory), so you can interact with it and inspect all variables.
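As a side note, collections.defaultdict removes the membership checks entirely; a minimal sketch of the same aggregation over the data list above:

from collections import defaultdict

count_by_department = defaultdict(int)
sum_by_department = defaultdict(float)

for record in data:
    department = record.get('department', 'undefined')
    count_by_department[department] += 1
    # only salaried records carry a usable annual_salary
    if record.get('salary_or_hourly') == 'Salary' and 'annual_salary' in record:
        sum_by_department[department] += float(record['annual_salary'])

print('sum_by_department = ', dict(sum_by_department))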

BeautifulSoup4 Returning Empty List when Attempting to Scrape a Table

I'm trying to pull the data from this url: https://www.winstonslab.com/players/player.php?id=98 and I keep getting the same error when I try to access the tables.
My scraping code is below. I run it, then hp = HTMLTableParser() and table = hp.parse_url('https://www.winstonslab.com/players/player.php?id=98')[0][1] returns the error 'index 0 is out of bounds for axis 0 with size 0'.
import requests
import pandas as pd
from bs4 import BeautifulSoup

class HTMLTableParser:

    def parse_url(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        return [(table['id'], self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        n_columns = 0
        n_rows = 0
        column_names = []

        # Find number of rows and columns
        # we also find the column titles if we can
        for row in table.find_all('tr'):
            # Determine the number of rows in the table
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows += 1
                if n_columns == 0:
                    # Set the number of columns for our table
                    n_columns = len(td_tags)
            # Handle column names if we find them
            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_names) == 0:
                for th in th_tags:
                    column_names.append(th.get_text())

        # Safeguard on Column Titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0, n_columns)
        df = pd.DataFrame(columns=columns,
                          index=range(0, n_rows))
        row_marker = 0
        for row in table.find_all('tr'):
            column_marker = 0
            columns = row.find_all('td')
            for column in columns:
                df.iat[row_marker, column_marker] = column.get_text()
                column_marker += 1
            if len(columns) > 0:
                row_marker += 1

        # Convert to float if possible
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass
        return df
If the data that you need is just the table, you can accomplish that with the pandas.read_html() function.
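A minimal sketch (the URL is the one from the question; which index in the returned list you need depends on the page layout):

import pandas as pd

# read_html returns a list of DataFrames, one per <table> found in the page
tables = pd.read_html('https://www.winstonslab.com/players/player.php?id=98')
print(len(tables))
print(tables[0].head())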

Python 3.x unsupported operand type in using encode decode

I am trying to build a generic crawler for my marketing project and keep track of where the information came from (blogs, testimonials, etc.). I am using Python 3.5 with Spyder/PyCharm as the IDE, and I keep getting the following error when using encode/decode. The input to my code is a list of company names and product features in an Excel file. I also searched for possible solutions, but the recommendations in the community are about typecasting, which I am not sure is the problem.
Kindly let me know if some more clarification is required from my side.
from __future__ import division, unicode_literals
import codecs
import re
import os
import xlrd
import requests
import urllib.request  # needed for urllib.request.Request / urlopen below
from urllib.request import urlopen
from time import sleep
from bs4 import BeautifulSoup
import openpyxl
from collections import Counter

page = 0
b = 0
n = 0
w = 0
p = 0
o = 0

workbook = xlrd.open_workbook("C:\Product.xlsx")
workbook1 = xlrd.open_workbook("C:\linkslist.xlsx")
sheet_names = workbook.sheet_names()
sheet_names1 = workbook1.sheet_names()

wb = openpyxl.Workbook()  # User Spreadsheet
ws = wb.active
ws.title = "User"
ws['A1'] = 'Feature'
ws['B1'] = 'Customer-Testimonials'
ws['C1'] = 'Case Study'
ws['D1'] = 'Blog'
ws['E1'] = 'Press'
ws['F1'] = 'Total posts'

ws1 = wb.create_sheet(title="Ml")
ws1['A1'] = 'Feature'
ws1['B1'] = 'Phrase'
ws1['C1'] = 'Address'
ws1['D1'] = 'Tag Count'

worksheet = workbook.sheet_by_name(sheet_names[0])
worksheet1 = workbook1.sheet_by_name(sheet_names[0])

for linknumber in range(0, 25):
    u = worksheet1.cell(linknumber, 0).value
    url = 'www.' + u.lower() + '.com'
    print(url)
    r = ''
    while r == '':
        try:
            print("in loop")
            r = requests.get("http://" + url)
        except:
            sleep(3)  # if the code still gives that error then try increasing the sleep time to 5 maybe
    print(r)
    data = r.text
    #print data
    soup1 = BeautifulSoup(data, "html.parser")
    #print soup1
    num = 3  # starting row number and keep the column same.
    word = ''
    word = worksheet.cell(num, 3).value
    while not word == 'end':
        print(num)
        #print word
        tag_list = []
        phrase = []
        counts = []
        address = []
        counts = Counter(tag_list)
        for link in soup1.find_all('a'):
            #print link
            add = link.encode("ascii", "ignore")
            print(add)
            if not 'Log In' in add:
                #print link.get('href')
                i = 0
                content = ''
                for i in range(1, 5):
                    if content == '':
                        try:
                            print(link.get('href'))
                            i += 1
                            req = urllib.request.Request(link.get('href'))
                            with urllib.request.urlopen(req) as response:
                                content = response.read()
                        except:
                            sleep(3)
                            # if the code still gives that error then try increasing the sleep time to 5 maybe
                            continue
                soup = BeautifulSoup(content, "html.parser")
                s = soup(text=re.compile(word))
                if s:
                    print("TRUE")
                    add = link.encode('ascii', 'ignore')
                    print(type(add))
                    if 'customer-testimonial' in add:
                        b += 1
                    elif 'case-study' in add:
                        n += 1
                    elif 'blog' in add:
                        w += 1
                    elif 'press' in add:
                        p += 1
                    else:
                        o += 1
                    #phrase_type=["Customer testimonials","news","ads","twitter","facebook","instagram"]
                    #print(os.path.join(root, name))
                    print(add)
                    for tag in s:
                        parent_html = tag.parent.name
                        print(parent_html)
                        tag_list.append(parent_html)
                    phrase.append(s)
                    address.append(add)
                    #print str(phrase)
                    counts = Counter(tag_list)
                    page += 1
        else:
            counts = Counter(tag_list)
            no = num - 1
            print(counts)
            print(word)
            ws['A%d' % no] = word.encode('utf-8', 'ignore')
            ws1['A%d' % no] = word.encode('utf-8', 'ignore')
            print("Number of pages is %d" % page)
            print("Number of Customer testimonials posts is %d" % b)
            ws['B%d' % no] = b
            print("Number of Case Studies posts is %d" % n)
            ws['C%d' % no] = n
            print("Number of blog posts is %d" % w)
            ws['D%d' % no] = w
            print("Number of press posts is %d" % p)
            ws['E%d' % no] = p
            print("Number of posts is %d" % page)
            ws['F%d' % no] = page
            ws1['B%d' % no] = phrase.encode('utf-8', 'ignore')
            ws1['C%d' % no] = address.encode('utf-8', 'ignore')
            ws1['D%d' % no] = counts.encode('utf-8', 'ignore')
            counts.clear()
            num += 1
            word = worksheet.cell(num, 3).value
            #print word
        page = 0
        b = 0
        n = 0
        w = 0
        p = 0
        o = 0
        phrase = []
        address = []
        tag_list = []
    wb.save('%s.xlsx' % worksheet1.cell(linknumber, 0).value)
I get the following output and error while running the code:
www.amobee.com
in loop
<Response [200]>
3
Traceback (most recent call last):
File "C:/project_web_parser.py", line 69, in <module>
add = link.encode("ascii", "ignore")
File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1094, in encode
u = self.decode(indent_level, encoding, formatter)
File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1159, in decode
indent_space = (' ' * (indent_level - 1))
TypeError: unsupported operand type(s) for -: 'str' and 'int'
Process finished with exit code 1
The traceback shows the error at line 69, where you try to encode link. To fix it, just change that line to:
add = link.encode("ascii", errors="ignore")
Why does it happen?
Your link variable is of type bs4.element.Tag:
>>> type(link)
<class 'bs4.element.Tag'>
The .encode() method for tags takes more arguments than the .encode() method for strings.
In the source code of bs4, in the file bs4\element.py at line 1089, you can find its definition:
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
The first argument is encoding, the second is indent_level (an int or None), and errors handling is the fourth.
So the error
unsupported operand type(s) for -: 'str' and 'int'
means that you tried to subtract 'ignore' - 1.
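Equivalently, passing the argument by keyword (or encoding the tag's string form instead of the Tag object) avoids the positional mix-up; a small sketch assuming link is a bs4.element.Tag as above:

# keyword argument makes it explicit which parameter receives "ignore"
add = link.encode(encoding="ascii", errors="ignore")

# or work with a plain string if only the markup/href text is needed
add = str(link).encode("ascii", "ignore").decode("ascii")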
