How can I use Python to scrape a multipage table and export to a CSV file? - python-3.x

i am trying to scrape a table that spans multiple pages and export to a csv file. only one line of data seems to get exported and it is jumbled up.
I have looked on the web and tried many iterations and very frustrated now. As you can tell from code I am a novice at coding!
import bs4 as bs
import urllib.request
import pandas as pd
import csv
max_page_num = 14
max_page_dig = 1 # number of digits in the page number
with open('result.csv',"w") as f:
f.write("Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment \n")
for i in range(0, max_page_num):
page_num = (max_page_dig - len(str(i))) * "0" +str(i) #gives a string in the format of 1, 01 or 001, 005 etc
print(page_num)
source = "https://www.topdrawersoccer.com/search/?query=&divisionId=&genderId=m&graduationYear=2020&positionId=0&playerRating=&stateId=All&pageNo=" + page_num + "&area=commitments"
print(source)
url = urllib.request.urlopen(source).read()
soup = bs.BeautifulSoup(url,'lxml')
table = soup.find('table')
table_rows = table.find_all('tr')
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
#final = row.strip("\n")
#final = row.replace("\n","")
with open('result.csv', 'a') as f:
f.write(row)
It seems when I write to csv it overwrites previous ones. It also pastes it on one line and the players name is concatenated with the school name . Thanks for any and all help.

I think you have a problem with your inside for loop. Try re-writing it as
with open('result.csv', 'a') as f:
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
f.write(row)
and see if it works.
More generally, this can probably be done more simply by using pandas. Try changing your for loop to:
for i in range(0, max_page_num):
page_num = ...
source = ....
df = pd.read_html(source)
df.to_csv('results.csv', header=False, index=False, mode='a') #'a' should append each table to the csv file, instead of overwriting it.

Related

extracting columns contents only so that all columns for each row are in the same row using Python's BeautifulSoup

I have the following python snippet in Jupyter Notebooks that works.
The challenge I have is to extract just the rows of columnar data only
Here's the snippet:
from bs4 import BeautifulSoup as bs
import pandas as pd
page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
page
soup = bs(page.content)
soup
allrows = soup.find_all("p")
print(allrows)
I'm a little unclear of what you are after but I think it's each individual row of data from URL provided.
I couldn't find a way to use beautiful soup to parse the data you are after but did find a way to separate the rows using .split()
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
soup = bs(page.content)
allrows = soup.find_all("p")
text = soup.text # turn soup into text
text_split = text.split('\n\n') # split the page into 3 sections
data = text_split[2] # rows of data
# create df column titles using variable titles on page
col_titles = text_split[1].split('\n')
df = pd.DataFrame(columns=range(14))
df.columns = col_titles[1:]
# 'try/except' to catch end of index,
# loop throw text data building complete rows
try:
complete_row = []
n1 = 0 #used to track index
n2 = 1
rows = data.split('\n')
for el in range(len(rows)):
full_row = rows[n1] + rows[n2]
complete_row.append(full_row)
n1 = n1 + 2
n2 = n2 + 2
except IndexError:
print('end of loop')
# loop through rows of data, clean whitespace and append to df
for row in complete_row:
elem = row.split(' ')
df.loc[len(df)] = [el for el in elem if el]
#fininshed dataframe
df

Import txt file and filter with space

I'm writing a script to track my orders from a website. I want to import the order# from a txt file and the script should repeat it self as long as there are ordernumbers.I wrote a code where the script imports this txt file and chooses a random ordernumber but the script puts all ordernumbers together and doesnt seperate them how can I fix this ?
this is my code:
f=open("Order#.txt", "r")
OrderNR = f.read()
words = OrderNR.split()
Repeat = len(words)
for i in range(Repeat):
randomlist = OrderNR
Orderrandom = random.choice(randomlist)
Mainlink = 'https://footlocker.narvar.com/footlocker/tracking/startrack?order_number=' + Orderrandom
Instead of using f.read(), try using f.readlines().
# Using readlines()
file1 = open('myfile.txt', 'r')
Lines = file1.readlines()
Try PANDAS
import pandas as pd
df = pd.read_csv('Order#.txt', delimiter='\t')
print(df)
you can see TXT file in table format

I want to find all the head lines containing certain word/words to be scraped and saved to a text file

How can I use a list of words and make the program pull out any new headings containing any one of the words inside the list. It gives out a error if I try to use the list of key words.
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
s_date = '2018/01/01'
e_date = '2018/01/06'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)
date_list = list()
while d1 <= d2:
# print(d1.strftime('%Y/%m/%d'))
date_list.append(d1.strftime('%Y/%m/%d'))
d1 += delta
print(date_list)
for d in date_list:
URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
result = requests.get(URL)
src = result.text
soup = BeautifulSoup(src, 'lxml')
# filename = 'new.csv'
# f = open(filename, 'w', newline = '')
# fx = csv.writer(f)
containers = soup.find_all('div',class_ = 'list-content')
key_words = ['Road', 'crash', 'dead', 'accidents']
key_word = input('Enter the desired word to search the news: ')
for c in containers:
headings = c.h5.a.text
if key_word in headings:
print(headings)
with open('nw.txt', 'w') as f:
f.write(headings)
# fx.writerow(headings)
You had several bugs in your code, that's why it didn't work as expected.
Here's the correct version of what you want to achieve:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
s_date = '2018/01/01'
e_date = '2018/01/06'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)
date_list = list()
while d1 <= d2:
date_list.append(d1.strftime('%Y/%m/%d'))
d1 += delta
print(date_list)
with open('nw.txt', 'w') as f:
for d in date_list:
URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
result = requests.get(URL)
src = result.text
soup = BeautifulSoup(src, 'lxml')
containers = soup.find_all('div',class_ = 'list-content')
key_words = ['Road', 'crash', 'dead', 'accidents']
# key_word = input('Enter the desired word to search the news: ')
for c in containers:
headings = c.h5.a.text
if any(key_word in headings for key_word in key_words):
print(headings)
f.write(headings + '\n')
What's happening (changes are at the bottom):
If you wanted to use a list of keywords (which is called key_words), then an option is to use built-in any function and iterate over all of keywords, checking wherther it is in your current headings.
Also you're open-ing file every time you want to write - it destroys last write and creates a new file. Instead you should open file once before loop.
Plus when you were writing headings to file, you didn't add \n which is the newline symbol - it would cause all headings to append as one row.

what is wrong with this Pandas and txt file code

I'm using pandas to open a CSV file that contains data from spotify, meanwhile, I have a txt file that contains various artists names from that CSV file. What I'm trying to do is get the value from each row of the txt and automatically search them in the function I've done.
import pandas as pd
import time
df = pd.read_csv("data.csv")
df = df[['artists', 'name', 'year']]
def buscarA():
start = time.time()
newdf = (df.loc[df['artists'].str.contains(art)])
stop = time.time()
tempo = (stop - start)
print (newdf)
e = ('{:.2f}'.format(tempo))
print (e)
with open("teste3.txt", "r") as f:
for row in f:
art = row
buscarA()
but the output is always the same:
Empty DataFrame
Columns: [artists, name, year]
Index: []
The problem here is that when you read the lines of your file in Python, it also gets the line break per row so that you have to strip it off.
Let's suppose that the first line of your teste3.txt file is "James Brown". It'd be read as "James Brown\n" and not recognized in the search.
Changing the last chunk of your code to:
with open("teste3.txt", "r") as f:
for row in f:
art = row.strip()
buscarA()
should work.

BeautifulSoup, Requests, Dataframe, extracting from <SPAN> and Saving to Excel

Python novice here again! 2 questions:
1) Instead of saving to multiple tabs (currently saving each year to a tab named after the year) how can I save all this data into one sheet in excel called "summary".
2) ('div',class_="sidearm-schedule-game-result") returns the format "W, 1-0". How can I split the "W, 1-0" into two columns, one containing "W" and the next column containing "1-0".
Thanks so much
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
import openpyxl
import csv
year_id = ['2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019']
lehigh_url = 'https://lehighsports.com/sports/mens-soccer/schedule/'
results = []
with requests.Session() as req:
for year in range(2003, 2020):
print(f"Extracting Year# {year}")
url = req.get(f"{lehigh_url}{year}")
if url.status_code == 200:
soup = BeautifulSoup(url.text, 'lxml')
rows = soup.find_all('div',class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
sheet = pd.DataFrame()
for row in rows:
date = row.find('div',class_="sidearm-schedule-game-opponent-date").text.strip()
name = row.find('div',class_="sidearm-schedule-game-opponent-name").text.strip()
opp = row.find('div',class_="sidearm-schedule-game-opponent-text").text.strip()
conf = row.find('div',class_="sidearm-schedule-game-conference-conference").text.strip()
try:
result = row.find('div',class_="sidearm-schedule-game-result").text.strip()
except:
result = ''
df = pd.DataFrame([[year,date,name,opp,conf,result]], columns=['year','date','opponent','list','conference','result'])
sheet = sheet.append(df,sort=True).reset_index(drop=True)
results.append(sheet)
def save_xls(list_dfs, xls_path):
with ExcelWriter(xls_path) as writer:
for n, df in enumerate(list_dfs):
df.to_excel(writer,'%s' %year_id[n],index=False,)
writer.save()
save_xls(results,'lehigh.xlsx')
Instead of creating a list of dataframes, you can append each sheet into 1 dataframe and write that to file with pandas. Then to split into 2 columns, just use .str.split() and split on the comma.
import requests
import pandas as pd
from bs4 import BeautifulSoup
year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
results = pd.DataFrame()
for year in year_id:
url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
print (url)
lehigh = requests.get(url).text
soup = BeautifulSoup(lehigh,'lxml')
rows = soup.find_all('div',class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
sheet = pd.DataFrame()
for row in rows:
date = row.find('div',class_="sidearm-schedule-game-opponent-date").text.strip()
name = row.find('div',class_="sidearm-schedule-game-opponent-name").text.strip()
opp = row.find('div',class_="sidearm-schedule-game-opponent-text").text.strip()
conf = row.find('div',class_="sidearm-schedule-game-conference-conference").text.strip()
try:
result = row.find('div',class_="sidearm-schedule-game-result").text.strip()
except:
result = ''
df = pd.DataFrame([[year,date,name,opp,conf,result]], columns=['year','date','opponent','list','conference','result'])
sheet = sheet.append(df,sort=True).reset_index(drop=True)
results = results.append(sheet, sort=True).reset_index(drop=True)
results['result'], results['score'] = results['result'].str.split(',', 1).str
results.to_excel('lehigh.xlsx')

Resources