I have created a script that collects information from a website and writes it to a spreadsheet. I'm in the process of becoming acquainted with Python scraping, and I would like some help: I want the player numbers to go in a separate column.
# import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from xlwt import Workbook

# create the workbook and a sheet to write into
wb = Workbook()
sheet1 = wb.add_sheet('Sheet 1')

# send the request
#url = 'http://fcf.cat/acta/1920/futbol-11/infantil-primera-divisio/grup-11/1i/sant-ildefons-ue-b/1i/lhospitalet-centre-esports-c'
url = 'https://www.fcf.cat/acta/2422183'
page = requests.get(url, timeout=5, verify=False)
soup = BeautifulSoup(page.text, 'html.parser')
#read acta
#acta_text = []
#acta_text_element = soup.find_all(class_='acta-table')
#for item in acta_text_element:
# acta_text.append(item.text)
i = 0
acta = []
for tr in soup.find_all('tr'):
    values = [td.text.strip() for td in tr.find_all('td')]
    print(values)
    acta.append(values)
    i = i + 1
    sheet1.write(i, 0, values)
wb.save('xlwt example.xls')
print(acta)
Thanks,
Two things to consider:
You can separate the first element of the list using values[0], then use values[1:] for the remaining items.
Use isnumeric to check whether a string value is a number.
Try this code:
for tr in soup.find_all('tr'):
    values = [td.text.strip() for td in tr.find_all('td')]
    print(values)
    acta.append(values)
    i = i + 1
    if len(values) and values[0].isnumeric():      # first element is a shirt number
        sheet1.write(i, 0, values[0])              # number in column 1
        sheet1.write(i, 1, ', '.join(values[1:]))  # rest of the row in column 2 (xlwt can't write a list directly)
    else:
        sheet1.write(i, 0, ', '.join(values))      # all values in column 1
Excel output (truncated)
To take the team on the left, for example, try this:
tables = soup.select('table')
players = []
columns = ["Player", "Shirt"]
# tables[1] holds the first team's data; the other team is in tables[8].
# After the header, the stripped text alternates shirt number / player name.
titulars = [item for item in tables[1].text.strip().split('\n') if len(item) > 0]
for name, num in zip(titulars[2::2], titulars[1::2]):
    players.append([name, num])
pd.DataFrame(players, columns=columns)
Output:
Player Shirt
0 TORNER ENCINAS, GONZALO 1
1 MACHUCA LOVERA, OSMAR SILVESTRE 3
2 JARA MARTIN, BLAI 4
3 AGUILAR LUQUE, DANIEL 5
4 FONT MURILLO, JOAQUIN 6
5 MARTÍNEZ ELVIR, RICHARD ADRIAN 7
6 MARQUEZ RODRIGUEZ, GERARD 8
7 PATUEL BATLLE, GERARD 10
8 EL MAHI ZAROUALI, BILAL 11
9 JAUME MORERA, ADRIA 14
10 DEL VALLE ESCANCIANO, MARTI 15
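If you also want the team on the right, the same slicing can be reused. Here is a minimal sketch that collects both line-ups into one frame, assuming both tables share the layout above (tables[1] and tables[8], per the comment in the code; the 'home'/'away' labels are just illustrative):
all_players = []
for table_idx, side in ((1, 'home'), (8, 'away')):
    rows = [item for item in tables[table_idx].text.strip().split('\n') if len(item) > 0]
    for name, num in zip(rows[2::2], rows[1::2]):   # same number/name alternation as above
        all_players.append([side, name, num])
pd.DataFrame(all_players, columns=['Team', 'Player', 'Shirt'])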
I need help getting the teams column from the table at https://www.hltv.org/stats. This code gives me all the values from the table, but not the teams, because they appear as images (hyperlinks). I want to get the title of each team.
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

r = requests.get("https://www.hltv.org/stats/players")
root = bs(r.content, "html.parser")
# Pull the player data out of the table and put it into a dataframe
table = str(root.find("table"))
players = pd.read_html(table, header=0)[0]
I need to get all the teams as a pandas column with 'Teams' as the header. Please help.
Since the team name is contained in the alt attribute of the team images, you can simply replace the <td> content with the values from the alt attributes:
table = root.find("table")
for td in table('td', class_='teamCol'):
    teams = [img['alt'] for img in td('img')]
    td.string = ', '.join(teams)
players = pd.read_html(str(table), header=0)[0]
This gives:
Player Teams Maps K-D Diff K/D Rating1.0
0 ZywOo Vitality, aAa 612 3853 1.39 1.29
1 s1mple Natus Vincere, FlipSid3, HellRaisers 1153 6153 1.31 1.24
2 sh1ro Gambit Youngsters 317 1848 1.39 1.21
3 Kaze ViCi, Flash, MVP.karnal 613 3026 1.31 1.20
[...]
You can do something like this using requests, pandas and BeautifulSoup:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
req = requests.get("https://www.hltv.org/stats/players")
root = bs(req.text, "html.parser")
# Find the first table in the page
table = root.find('table', {'class': 'stats-table player-ratings-table'})
# Find all td with class "teamCol"
teams = table.find_all('td', {'class': 'teamCol'})
# Get img source & title from all img tags in teams
imgs = [(elm.get('src'), elm.get('title')) for team in teams for elm in team.find_all('img')]
# Create your DataFrame
df = pd.DataFrame(imgs, columns=['source', 'title'])
print(df)
Output:
source title
0 https://static.hltv.org/images/team/logo/9565 Vitality
1 https://static.hltv.org/images/team/logo/5639 aAa
2 https://static.hltv.org/images/team/logo/4608 Natus Vincere
3 https://static.hltv.org/images/team/logo/5988 FlipSid3
4 https://static.hltv.org/images/team/logo/5310 HellRaisers
... ... ...
1753 https://static.hltv.org/images/team/logo/4602 Tricked
1754 https://static.hltv.org/images/team/logo/4501 ALTERNATE aTTaX
1755 https://static.hltv.org/images/team/logo/7217 subtLe
1756 https://static.hltv.org/images/team/logo/5454 SKDC
1757 https://static.hltv.org/images/team/logo/6301 Splyce
[1758 rows x 2 columns]
I'm trying to scrape, daily, the stock value of a product. This is the web page: https://funds.ddns.net/f.php?isin=ES0110407097. And this is the code I'm trying:
import pandas as pd
from bs4 import BeautifulSoup

html_string = 'https://funds.ddns.net/f.php?isin=ES0110407097'
soup = BeautifulSoup(html_string, 'lxml')
new_table = pd.DataFrame(columns=range(0, 2), index=[0])

row_marker = 0
column_marker = 0
for row in soup.find_all('tr'):
    columns = soup.find_all('td')
    for column in columns:
        new_table.iat[row_marker, column_marker] = column.get_text()
        column_marker += 1
print(new_table)
I would like to get in Python the same format I can see on the web, both the date and the value. How can I get it, please?
There's a simpler way for that particular page:
import requests
import pandas as pd
url = 'https://funds.ddns.net/f.php?isin=ES0110407097'
resp = requests.get(url)
new_table = pd.read_html(resp.text)[0]
print(new_table.head(5))
Output:
0 1
0 FECHA VL:EUR
1 2019-12-20 120170000
2 2019-12-19 119600000
3 2019-12-18 119420000
4 2019-12-17 119390000
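Since read_html returns the header row as data row 0 here, a possible cleanup (a sketch, assuming the layout shown above) is to promote that row and convert the column types:
new_table.columns = new_table.iloc[0]                     # promote the FECHA / VL:EUR row to header
new_table = new_table.drop(0).reset_index(drop=True)
new_table['FECHA'] = pd.to_datetime(new_table['FECHA'])   # date column
new_table['VL:EUR'] = pd.to_numeric(new_table['VL:EUR'])  # numeric value column
print(new_table.dtypes)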
I have set up a few argparse arguments for my script like so:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--file", "-i", type=str, required=True)
parser.add_argument("--outfile", "-o", type=str, required=False)
parser.add_argument("--tab", "-t", type=str, required=False)
parser.add_argument("--tab_result", "-tr", type=str, required=False)
args = parser.parse_args()
# assign values to variables
infile = args.file
outfilepath = args.outfile
tabs = args.tab
tab_result = args.tab_result
I need to pass the value of each argparse argument above into a function and assign them to a dataframe. I am trying to do this like so:
def func1():
    print(infile)
    doc = pd.DataFrame()
    doc['file'] = infile
    doc['output_table_name'] = outfilepath
    doc['output_table_fields'] = json_normalized['index']           # from another df, works fine
    doc['output_table_datatypes'] = json_normalized['dtypes.name']  # from another df, works fine
    writer = pd.ExcelWriter(tabs)
    doc.to_excel(writer, args.documentor_tab)
    writer.save()
    #print(infile)
    #print(outfilename)
    print(doc)
    return doc
    print('wrote document')  # unreachable: sits after the return

func1()
When I print this dataframe, the infile and outfilepath argparse values don't get assigned to the dataframe columns, although all the rest of the argparse values do.
What am I doing wrong that not all values from argparse are getting assigned to the dataframe?
doc['file'] references the 'file' column, so you can't set it to a string before there are any rows in the dataframe.
If there's only one row in json_normalized then you probably want something like this:
def func1(infile, outfilepath, tabs, tabs_result, json_normalized):
    doc = pd.DataFrame(columns=['file', 'output_table_name', 'output_table_fields', 'output_table_datatypes'])
    index = json_normalized['index'][0]
    dtypes_name = json_normalized['dtypes.name'][0]
    doc.loc[0] = [infile, outfilepath, index, dtypes_name]
    ...
    return doc
or, if you mean to write a whole column of indices, swap the order:
def func1(infile, outfilepath, tabs, tabs_result, json_normalized):
    doc = pd.DataFrame(columns=['file', 'output_table_name', 'output_table_fields', 'output_table_datatypes'])
    doc['output_table_fields'] = json_normalized['index']
    doc['output_table_datatypes'] = json_normalized['dtypes.name']
    doc['output_table_name'] = outfilepath
    doc['file'] = infile
    ...
    return doc
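With the function taking its inputs as parameters, the call site simply forwards the parsed arguments; a minimal sketch reusing the names from the question:
args = parser.parse_args()
doc = func1(args.file, args.outfile, args.tab, args.tab_result, json_normalized)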
argparse and passing/using variables in a function isn't the issue. The problem is with how you create the data frame.
Consider this stripped down example:
In [255]: doc = pd.DataFrame()
In [256]: doc['file'] = 'foobar'
In [257]: doc['outfile'] = 'anothername'
In [258]: doc
Out[258]:
Empty DataFrame
Columns: [file, outfile]
Index: []
In [259]: doc['col'] = [1,2,3,4]
In [260]: doc
Out[260]:
file outfile col
0 NaN NaN 1
1 NaN NaN 2
2 NaN NaN 3
3 NaN NaN 4
The initial assignments apply to an empty frame, one without any rows.
Assigning the constant values to the columns after creating the rows:
In [261]: doc['file'] = 'foobar'
In [262]: doc['outfile'] = 'anothername'
In [263]: doc
Out[263]:
file outfile col
0 foobar anothername 1
1 foobar anothername 2
2 foobar anothername 3
3 foobar anothername 4
Alternatively you could specify the row indices at the start:
In [265]: doc = pd.DataFrame(index=np.arange(5))
In [266]: doc
Out[266]:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]
In [267]: doc['file'] = 'foobar'
In [268]: doc['outfile'] = 'anothername'
In [269]: doc
Out[269]:
file outfile
0 foobar anothername
1 foobar anothername
2 foobar anothername
3 foobar anothername
4 foobar anothername
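A third option (a minimal sketch) is to build everything in a single constructor call; the dict form broadcasts the scalar values against the list-valued column:
import pandas as pd

# scalars are broadcast against the list-valued column at construction time
doc = pd.DataFrame({'file': 'foobar', 'outfile': 'anothername', 'col': [1, 2, 3, 4]})
print(doc)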
I am attempting to parse Yahoo Finance's historical stock price tables for various stocks using BeautifulSoup with Python. Here is the code:
import requests
import pandas as pd
from bs4 import BeautifulSoup

tickers = ['HSBA.L', 'RDSA.L', 'RIO.L', 'BP.L', 'GSK.L', 'DGE.L', 'AZN.L', 'VOD.L', 'GLEN.L', 'ULVR.L']
url = 'https://uk.finance.yahoo.com/quote/HSBA.L/history?period1=1478647619&period2=1510183619&interval=1d&filter=history&frequency=1d'
request = requests.get(url)
soup = BeautifulSoup(request.text, 'lxml')
table = soup.find_all('table')[0]

# first pass: count rows and grab the header names
n_rows = 0
n_columns = 0
column_name = []
for row in table.find_all('tr'):
    data = row.find_all('td')
    if len(data) > 0:
        n_rows += 1
        if n_columns == 0:
            n_columns = len(data)
    headers = row.find_all('th')
    if len(headers) > 0 and len(column_name) == 0:
        for header_names in headers:
            column_name.append(header_names.get_text())

# second pass: fill the dataframe cell by cell
new_table = pd.DataFrame(columns=column_name, index=range(0, n_rows))
row_index = 0
for row in table.find_all('tr'):
    column_index = 0
    columns = row.find_all('td')
    for column in columns:
        new_table.iat[row_index, column_index] = column.get_text()
        column_index += 1
    if len(columns) > 0:
        row_index += 1
The first time I ran the code, I had the interval set to exactly two years from November 7th 2015 (with weekly prices). The issue is that the resulting data frame is 101 rows long, but I know for a fact it should be more (106). I then changed the interval to the default one when the page opens (daily prices), but I still got the same 101 rows, whereas the actual data is much larger. Is there anything wrong with the code, or is it something Yahoo Finance is doing?
Any help is appreciated; I'm really stuck here.
AFAIK, the API was shut down in May of 2017. Can you use Google Finance? If you can accept Excel as a solution, here is a link to a file you can download that pulls all kinds of historical time series data:
http://investexcel.net/multiple-stock-quote-downloader-for-excel/
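As for the 101-row limit itself: the initial HTML of the history page appears to contain only the first batch of rows (about 100 plus a header); the rest is filled in by JavaScript as you scroll, which requests never executes. A quick check (a sketch against the URL from the question):
import requests
from bs4 import BeautifulSoup

url = ('https://uk.finance.yahoo.com/quote/HSBA.L/history'
       '?period1=1478647619&period2=1510183619&interval=1d&filter=history&frequency=1d')
soup = BeautifulSoup(requests.get(url).text, 'lxml')
rows = soup.find_all('table')[0].find_all('tr')
print(len(rows))  # stays around 101 regardless of the date range if the rest loads lazily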
What I am trying to do is get bootstrap confidence limits by row, regardless of the number of rows, and make a new dataframe from the output. I can currently do this for the entire dataframe, but not by row. The data in my actual program looks similar to what I have below:
0 1 2
0 1 2 3
1 4 1 4
2 1 2 3
3 4 1 4
I want the new dataframe to look something like this with the lower and upper confidence limits:
0 1
0 1 2
1 1 5.5
2 1 4.5
3 1 4.2
The currently generated output looks like this:
0 1
0 2.0 2.75
The Python 3 code below generates a mock dataframe and computes the bootstrap confidence limits for the entire dataframe. The result is a new dataframe with just two values, an upper and a lower confidence limit, rather than four sets of two (one for each row).
import pandas as pd
import numpy as np
import scikits.bootstrap as sci

zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
                   [[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x = zz.dtypes
print(x)
# keep only the first element of each cell's list
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0], zz.index, zz.columns)
print(a)
b = sci.ci(a)
b = pd.DataFrame(b)
b = b.T
print(b)
Thank you for any help.
scikits.bootstrap operates by assuming that data samples are arranged by row, not by column. If you want the opposite behavior, just use the transpose and a statfunction that doesn't combine columns.
import pandas as pd
import numpy as np
import scikits.bootstrap as sci

zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
                   [[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x = zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0], zz.index, zz.columns)
print(a)
b = sci.ci(a.T, statfunction=lambda x: np.average(x, axis=0))
print(b.T)
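To get the per-row shape the question asks for, the result can be put back into a dataframe; a small sketch continuing from b above:
ci_df = pd.DataFrame(b.T, columns=['lower', 'upper'])  # one (lower, upper) pair per row of a
print(ci_df)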
Below is the answer I ended up figuring out to create bootstrap CIs by row:
import pandas as pd
import numpy as np
import numpy.random as npr

zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
                   [[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
x = zz.dtypes
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0], zz.index, zz.columns)
print(a)

def bootstrap(data, num_samples, statistic, alpha):
    # resample with replacement, then take the alpha/2 and 1-alpha/2 quantiles
    n = len(data)
    idx = npr.randint(0, n, (num_samples, n))
    samples = data[idx]
    stat = np.sort(statistic(samples, 1))
    return (stat[int((alpha/2.0)*num_samples)],
            stat[int((1-alpha/2.0)*num_samples)])

cc = list(a.index.values)  # informs the generator of the number of rows

def bootbyrow(cc):
    for xx in range(len(cc)):
        k = a.apply(lambda y: y[xx]).values  # values of row xx
        yield list(bootstrap(k, 10000, np.mean, 0.05))

abc = pd.DataFrame(list(bootbyrow(cc)))  # bootstrap ci by row
# the next 4 calls just show that it's working correctly
a0 = bootstrap((a.loc[0,].values),10000,np.mean,0.05)
a1 = bootstrap((a.loc[1,].values),10000,np.mean,0.05)
a2 = bootstrap((a.loc[2,].values),10000,np.mean,0.05)
a3 = bootstrap((a.loc[3,].values),10000,np.mean,0.05)
print(abc)
print(a0)
print(a1)
print(a2)
print(a3)