I have a DataFrame, and I am running some code that extracts data based on its values. The DataFrame has about 1.5 million rows, so the extraction takes a long time; my server keeps stopping, the whole process gets stuck, and it has to start again from zero.
I want to save the extracted data to a new CSV file after every iteration, or after a defined number of rows.
def get_dsm_coverage(df):
    import math
    import mpmath
    list_2019 = []
    list_2020 = []
    list_2021 = []
    for z in df.index:
        lat, long = (df['LATITUDE'][z], df['LONGITUDE'][z])
        print(z)
        zoom = 21
        lat_rad = math.radians(lat)
        lon_rad = math.radians(long)
        n = 2 ** zoom
        xtile = str(int(n * ((long + 180) / 360)))
        ytile = str(int(n * (1 - (np.log(np.tan(lat_rad) + float(mpmath.sec(lat_rad))) / np.pi)) / 2))
        print(long, lat, xtile, ytile)
        for year in [2019, 2020, 2021]:
            url = 'https://api.gic.org/images/GetDSMTile/21/' + str(xtile) + '/' + str(ytile) + '/?layer=bluesky-ultra&year=' + str(year)
            r = requests.get(url, params={'AuthToken': token})
            if r.status_code != 200:
                print('got inside')
                url = 'https://api.gic.org/images/GetDSMTile/21/' + str(xtile) + '/' + str(ytile) + '/?layer=bluesky-ultra-g&year=' + str(year)
                r = requests.get(url, params={'AuthToken': token})
            try:
                content_type = r.headers['Content-type']
            except:
                content_type = 'application/json;charset=ISO-8859-1'
            if content_type == 'image/tiff':
                print(r.status_code)
                print(url)
                print(content_type)
                if year == 2019:
                    list_2019.append(1)
                elif year == 2020:
                    list_2020.append(1)
                else:
                    list_2021.append(1)
            else:
                print(content_type)
                if year == 2019:
                    list_2019.append(0)
                elif year == 2020:
                    list_2020.append(0)
                else:
                    list_2021.append(0)
    return list_2019, list_2020, list_2021

list_2019, list_2020, list_2021 = get_dsm_coverage(df)
df['dsm_2019'] = list_2019
df['dsm_2020'] = list_2020
df['dsm_2021'] = list_2021
The crucial part is that we keep track of our calculations and save them regularly. Note that this code does not actually hit the API, and when run the first time it will intentionally error out so that you can restart it and see it recover. Also note that the existence of the temp file signals that there is prior work to resume from, and as such it should be cleaned up after a successful run.
import json
import math
import os
import random

import mpmath
import numpy
import pandas
import requests
import urllib3

#-------------------------------
# Don't build your own retry.
# Requests already supports it!
#-------------------------------
request_with_retry = requests.Session()
request_with_retry.mount("https://", requests.adapters.HTTPAdapter(
    max_retries=urllib3.util.retry.Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[408, 409, 429, 500, 502, 503, 504]
    )
))
#-------------------------------

#-------------------------------
# We can pull this out of the main loop to simplify it
#-------------------------------
def get_tiles(latitude, longitude, zoom_level):
    lat_rad = math.radians(latitude)
    #lon_rad = math.radians(longitude)
    n = 2 ** zoom_level
    xtile = n * (longitude + 180) / 360
    ytile = n * (1 - (numpy.log(numpy.tan(lat_rad) + float(mpmath.sec(lat_rad))) / numpy.pi)) / 2
    return int(xtile), int(ytile)
#-------------------------------

#-------------------------------
# Return a dictionary that contains prior saved work (or is blank)
# See: save_prior_work()
#-------------------------------
def fetch_prior_work(tmp_file_path):
    try:
        with open(tmp_file_path, "r", encoding="utf-8") as temp_in:
            return json.load(temp_in)
    except (FileNotFoundError, json.decoder.JSONDecodeError):
        return {}
#-------------------------------

#-------------------------------
# Save our work to a temp file.
# See: fetch_prior_work()
#-------------------------------
def save_prior_work(tmp_file_path, work_dictionary):
    with open(tmp_file_path, "w", encoding="utf-8", newline="") as temp_out:
        json.dump(work_dictionary, temp_out)
#-------------------------------

def get_dsm_coverage(token, df, tmp_file_path):
    api_url_params = {'AuthToken': token}
    api_url_template = "https://api.gic.org/images/GetDSMTile/21/{xtile}/{ytile}/?layer=bluesky-ultra&year={year}"
    api_zoom_level = 21
    years = ["2019", "2020", "2021"]

    note_every = 3  # How often do we print
    save_every = 6  # How often do we save progress (probably every 100 or 1000)

    ## -------------------
    ## if our tmp_file_path exists, it represents prior work we can skip
    ## -------------------
    year_lists = fetch_prior_work(tmp_file_path)
    ## -------------------

    ## -------------------
    ## make sure our year_lists results is properly initialized
    ## -------------------
    for year in years:
        year_lists.setdefault(year, [])
    ## -------------------

    ## -------------------
    ## Determine if there is any prior work we can skip
    ## -------------------
    rows_already_processed = len(year_lists[years[0]])
    if rows_already_processed:
        print(f"skipping first {rows_already_processed} rows")
    ## ----------------------

    for z in df.index[rows_already_processed:]:
        ## ----------------------
        ## printing is expensive so let's only print every so often
        ## ----------------------
        if not z % note_every:
            print(f"Row: {z}")
        ## ----------------------

        ## ----------------------
        ## calculate the tile ids
        ## ----------------------
        xtile, ytile = get_tiles(df["LATITUDE"][z], df["LONGITUDE"][z], api_zoom_level)
        ## ----------------------

        for year in years:
            url = api_url_template.format_map({"xtile": xtile, "ytile": ytile, "year": year})

            ## ---------------------------
            ## TEST: We don't have a key....
            ## ---------------------------
            #response = request_with_retry.get(url, params=api_url_params)
            response = None
            ## ---------------------------

            try:
                content_type = response.headers['Content-type']
            except:
                content_type = "application/json;charset=ISO-8859-1"

            ## ---------------------------
            ## TEST: We don't have a key....
            ## ---------------------------
            content_type = random.choice(["image/tiff", content_type])
            ## ---------------------------

            if content_type == 'image/tiff':
                year_lists[year].append(1)
            else:
                year_lists[year].append(0)

        ## ----------------------
        ## Every so often, dump the work we have done to a temp file.
        ## ----------------------
        if z and not (z % save_every):
            print(f"\tSaving Temp File...")
            save_prior_work(tmp_file_path, year_lists)
        ## ----------------------

        ## ----------------------
        ## TEST: Force an error the first run so we can restart
        ## ----------------------
        if z == 10 and not rows_already_processed:
            raise Exception("Bummer")
        ## ----------------------

    return year_lists.values()

AUTH_TOKEN = ""
TMP_FILE_PATH = "./temp.json"

df = pandas.DataFrame([
    ("Ansonia", "CT", "USA", 41.346439, -73.084938),
    ("Walsenburg", "CO", "USA", 37.630322, -104.790543),
    ("Sterling", "CO", "USA", 40.626743, -103.217026),
    ("Steamboat Springs", "CO", "USA", 40.490429, -106.842384),
    ("Ouray", "CO", "USA", 38.025131, -107.675880),
    ("Leadville", "CO", "USA", 39.247478, -106.300194),
    ("Gunnison", "CO", "USA", 38.547871, -106.938622),
    ("Fort Morgan", "CO", "USA", 40.255306, -103.803062),
    ("Panama City", "FL", "USA", 30.193626, -85.683029),
    ("Miami Beach", "FL", "USA", 25.793449, -80.139198),
    ("Cripple Creek", "CO", "USA", 38.749077, -105.183060),
    ("Central City", "CO", "USA", 39.803318, -105.516830),
    ("Cañon City", "CO", "USA", 38.444931, -105.245720),
])
df.set_axis(["Name", "State", "Country", "LATITUDE", "LONGITUDE"], axis=1, inplace=True)

list_2019, list_2020, list_2021 = get_dsm_coverage(AUTH_TOKEN, df, TMP_FILE_PATH)

## ----------------------
## if we get here, TMP_FILE_PATH should/could be deleted...
## ----------------------
try:
    os.remove(TMP_FILE_PATH)
except OSError:
    pass
## ----------------------

df['dsm_2019'] = list_2019
df['dsm_2020'] = list_2020
df['dsm_2021'] = list_2021
print(df)
One should expect the first execution to give:
Row: 0
Row: 3
Row: 6
Saving Temp File...
Row: 9
Traceback (most recent call last):
File "test.py", line 167, in <module>
list_2019, list_2020, list_2021 = get_dsm_coverage(AUTH_TOKEN, df, TMP_FILE_PATH)
File "test.py", line 142, in get_dsm_coverage
raise Exception("Bummer")
Exception: Bummer
and a following execution to give something like:
skipping first 7 rows
Row: 9
Row: 12
Saving Temp File...
Name State Country LATITUDE LONGITUDE dsm_2019 dsm_2020 dsm_2021
0 Ansonia CT USA 41.346439 -73.084938 0 1 1
1 Walsenburg CO USA 37.630322 -104.790543 1 0 0
2 Sterling CO USA 40.626743 -103.217026 0 1 0
3 Steamboat Springs CO USA 40.490429 -106.842384 0 0 0
4 Ouray CO USA 38.025131 -107.675880 0 0 0
5 Leadville CO USA 39.247478 -106.300194 1 0 1
6 Gunnison CO USA 38.547871 -106.938622 0 1 1
7 Fort Morgan CO USA 40.255306 -103.803062 1 0 0
8 Panama City FL USA 30.193626 -85.683029 0 1 1
9 Miami Beach FL USA 25.793449 -80.139198 0 1 1
10 Cripple Creek CO USA 38.749077 -105.183060 1 0 1
11 Central City CO USA 39.803318 -105.516830 0 0 0
12 Cañon City CO USA 38.444931 -105.245720 1 0 0
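The question asked for the progress to land in a CSV rather than a JSON temp file; the same checkpointing works if save_prior_work()/fetch_prior_work() write and read a CSV instead. A minimal sketch, assuming the same year_lists structure (equal-length lists keyed by year); the function names with the _csv suffix are hypothetical:

import pandas

def save_prior_work_csv(tmp_file_path, work_dictionary):
    # Hypothetical CSV variant of save_prior_work(): one column per year,
    # one row per DataFrame row processed so far.
    pandas.DataFrame(work_dictionary).to_csv(tmp_file_path, index=False)

def fetch_prior_work_csv(tmp_file_path):
    # Hypothetical CSV variant of fetch_prior_work(): reload the checkpoint,
    # or start from scratch if it does not exist yet.
    try:
        return pandas.read_csv(tmp_file_path, dtype=int).to_dict("list")
    except FileNotFoundError:
        return {}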
I am trying to create dummy data for an NER task by replacing person_name with some dummy names. But it gives me weird results when the same entity occurs multiple times, as discussed here:
Strange result when removing item from a list while iterating over it
Modifying list while iterating
Input example spans:
{
 'text': "Mohan dob is 25th dec 1980. Mohan loves to play cricket.",
 'spans': [{'start': 0, 'end': 5, 'label': 'person_name', 'ngram': 'Mohan'},
           {'start': 28, 'end': 33, 'label': 'person_name', 'ngram': 'Mohan'},
           {'start': 13, 'end': 26, 'label': 'date', 'ngram': '25th dec 1980'}
          ]
}
The entity person_name occurs twice in the sample.
sample_names=['Jon', 'Sam']
I want to replace (0, 5, 'person_name') and (28, 33, 'person_name') with sample_names.
Dummy example output:
[
 {'text': "Jon dob is 25th dec 1980. Jon loves to play cricket.",
  'spans': [{'start': 0, 'end': 3, 'label': 'person_name', 'ngram': 'Jon'},
            {'start': 26, 'end': 31, 'label': 'person_name', 'ngram': 'Jon'},
            {'start': 11, 'end': 24, 'label': 'date', 'ngram': '25th dec 1980'}
           ]
 },
 {'text': "Sam dob is 25th dec 1980. Sam loves to play cricket.",
  'spans': [{'start': 0, 'end': 3, 'label': 'person_name', 'ngram': 'Sam'},
            {'start': 26, 'end': 31, 'label': 'person_name', 'ngram': 'Sam'},
            {'start': 11, 'end': 24, 'label': 'date', 'ngram': '25th dec 1980'}
           ]
 }
]
The spans also get updated in the output.
target_entity='person_name'
names=sample_names
Code:
def generate(data, target_entity, names):
    text = data['text']
    spans = data['spans']
    new_sents = []
    if spans:
        spans = [(d['start'], d['end'], d['label']) for d in spans]
        spans.sort()
        labellist = [s[2] for s in spans]
        # get before_spans and after_spans around target entity
        for n in names:
            gap = 0
            for i, tup in enumerate(spans):
                lab = tup[2]
                if lab == target_entity:
                    new_spans = {"before": spans[:i], "after": spans[i+1:]}
                    print("the spans before and after :\n", new_spans)
                    start = tup[0]  # check this
                    end = tup[1]
                    ngram = text[start:end]
                    new_s = text[:start] + n + text[end:]
                    gap = len(n) - len(ngram)
                    before = new_spans["before"]
                    after = [(tup[0] + gap, tup[1] + gap, tup[2]) for tup in new_spans["after"]]
                    s_sp = before + [(start, start + len(n), target_entity)] + after
                    text = new_s
                    en = {"text": new_s, "spans": [{"start": tup[0], "end": tup[1], "label": tup[2], "ngram": new_s[tup[0]:tup[1]]} for tup in s_sp]}
                    spans = s_sp
            new_sents.append(en)
If all you seek to do is replace the placeholder with a new value, you can do something like this:
## --------------------
## Some example input from you
## --------------------
input_data = [
    (162, 171, 'pno'),
    (241, 254, 'person_name'),
    (373, 384, 'date'),
    (459, 477, 'date'),
    None,
    (772, 785, 'person_name'),
    (797, 806, 'pno')
]
## --------------------

## --------------------
## create an iterator out of our name list
## you will need to decide what happens if sample_names
## gets exhausted.
## --------------------
sample_names = [
    'Jon',
    'Sam'
]
sample_names_iter = iter(sample_names)
## --------------------

for row in input_data:
    if not row:
        continue
    start = row[0]
    end = row[1]
    name = row[2] if row[2] != "person_name" else next(sample_names_iter)
    print(f"{name} dob is 25th dec 1980. {name} loves to play cricket.")
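If the adjusted spans are needed as well (as in the original question), a minimal sketch that rebuilds both the text and the offsets for one replacement name might look like this; substitute_names is a hypothetical helper, not part of the answer above, and it assumes the spans do not overlap:

def substitute_names(sample, new_name, target_label="person_name"):
    # Swap every target_label span for new_name and recompute all offsets.
    text = sample["text"]
    spans = sorted(sample["spans"], key=lambda s: s["start"])
    out_text = []
    out_spans = []
    cursor = 0   # position in the original text
    offset = 0   # accumulated length difference so far
    for sp in spans:
        out_text.append(text[cursor:sp["start"]])  # untouched text before the span
        ngram = new_name if sp["label"] == target_label else text[sp["start"]:sp["end"]]
        new_start = sp["start"] + offset
        out_text.append(ngram)
        out_spans.append({"start": new_start, "end": new_start + len(ngram),
                          "label": sp["label"], "ngram": ngram})
        offset += len(ngram) - (sp["end"] - sp["start"])
        cursor = sp["end"]
    out_text.append(text[cursor:])  # tail after the last span
    return {"text": "".join(out_text), "spans": out_spans}

sample = {
    "text": "Mohan dob is 25th dec 1980. Mohan loves to play cricket.",
    "spans": [{"start": 0, "end": 5, "label": "person_name", "ngram": "Mohan"},
              {"start": 28, "end": 33, "label": "person_name", "ngram": "Mohan"},
              {"start": 13, "end": 26, "label": "date", "ngram": "25th dec 1980"}],
}
for name in ["Jon", "Sam"]:
    print(substitute_names(sample, name))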
I'm new to Python and trying to get my head around this code.
We have to import a text file named line-items.txt; an excerpt of the file, including its header, is as follows:
product name quantity unit price
product a 1 10.00
product b 5 19.70
product a 3 10.00
product b 7 19.70
We need to write code that searches for each product name and sums its quantity; the sales revenue formula is "unit price of the product" * "total quantity of the product". We then have to create a new text file, and the output should be something like this:
product name sales volume sales revenue
product a 4 40.0
product b 12 236.39999999999998
In my code below, it finds the quantities of product b (5 and 7) and its unit price (I used print statements to check the output, but in the code below I commented out the unit price print for simplicity), yet it is not adding the values it found:
def main():
    # opening file to read line-items.txt
    with open("line-items.txt", "r") as line_items:
        # to get the list of lines and reading the second line of the text
        prod_b = 0
        newtxt = line_items.readlines()[1:]
        for line in newtxt:
            text = line.strip().split()
            product_name = text[0:2]
            quantity = text[2]
            unit_price = text[3]
        if product_name == ['product', 'b']:
            prod_b += int(quantity)
            unit_price_b = float(unit_price)
            # print(unit_price_b)
            print(quantity)
        line_items.close()


if __name__ == '__main__':
    main()
The output of the code above is as follows; it's not adding 5 and 7. What am I doing wrong?
5
7
Thanks,
Rogue
While the answer provided by @JonSG is certainly more elegant, the problem with your code is quite simple: it is caused by an indentation error. You need to indent the if statement under the for loop, as shown below:
def main():
    # opening file to read line-items.txt
    with open("line-items.txt", "r") as line_items:
        # to get the list of lines and reading the second line of the text
        prod_b = 0
        newtxt = line_items.readlines()[1:]
        for line in newtxt:
            text = line.strip().split()
            product_name = text[0:2]
            quantity = text[2]
            unit_price = text[3]
            if product_name == ['product', 'b']:
                prod_b += int(quantity)
                unit_price_b = float(unit_price)
                # print(unit_price_b)
                print(quantity)
        line_items.close()
Using a nested collections.defaultdict makes this problem rather straightforward.
import collections
import json

results = collections.defaultdict(lambda: collections.defaultdict(float))

with open("line-items.txt", "r") as line_items:
    next(line_items)  ## skip first line
    for row in line_items.readlines():
        cells = row.split(" ")
        product_name = f"{cells[0]} {cells[1]}"
        quantity = int(cells[2])
        price = float(cells[3])
        results[product_name]["quantity"] += quantity
        results[product_name]["sales volume"] += quantity * price

print(json.dumps(results, indent=4))
results in:
{
    "product a": {
        "quantity": 4.0,
        "sales volume": 40.0
    },
    "product b": {
        "quantity": 12.0,
        "sales volume": 236.4
    }
}
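If the summary also has to land in a new text file (as the assignment requires), a short follow-up could write it out; the file name sales-summary.txt and the single-space layout are assumptions:

# Write the aggregated totals to a new text file in the requested layout.
with open("sales-summary.txt", "w") as summary:
    summary.write("product name sales volume sales revenue\n")
    for product, totals in results.items():
        summary.write(f"{product} {int(totals['quantity'])} {totals['sales volume']}\n")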
import json
import requests
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta

ARTICLES_DIR = join('tempdata', 'articles')
makedirs(ARTICLES_DIR, exist_ok=True)

API_ENDPOINT = 'http://content.guardianapis.com/search'
my_params = {
    'q': 'coronavirus,stock,covid',
    'sectionID': 'business',
    'from-date': "2019-01-01",
    'to-date': "2020-09-30",
    'order-by': "newest",
    'show-fields': 'all',
    'page-size': 300,
    'api-key': '### my cryptic key ###'
}

# day iteration from here:
# http://stackoverflow.com/questions/7274267/print-all-day-dates-between-two-dates
start_date = date(2019, 1, 1)
end_date = date(2020, 9, 30)
dayrange = range((end_date - start_date).days + 1)
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    fname = join(ARTICLES_DIR, datestr + '.json')
    if not exists(fname):
        # then let's download it
        print("Downloading", datestr)
        all_results = []
        my_params['from-date'] = datestr
        my_params['to-date'] = datestr
        current_page = 1
        total_pages = 1
        while current_page <= total_pages:
            print("...page", current_page)
            my_params['page'] = current_page
            resp = requests.get(API_ENDPOINT, my_params)
            data = resp.json()
            all_results.extend(data['response']['results'])
            # if there is more than one page
            current_page += 1
            total_pages = data['response']['pages']
        with open(fname, 'w') as f:
            print("Writing to", fname)
            # re-serialize it for pretty indentation
            f.write(json.dumps(all_results, indent=2))
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-18-f04b4f0fe9ed> in <module>
49 resp = requests.get(API_ENDPOINT, my_params)
50 data = resp.json()
---> 51 all_results.extend(data['response']['results'])
52 # if there is more than one page
53 current_page += 1
KeyError: 'results'
The same error occurs for 'pages'.
At first there were no issues and I was able to run it. The download crashed after 2020-03-24, and since then I can't get the code running again.
I'm referring to lines 51 and 54; at least at that point the code crashes.
Not sure how to get rid of the issue. Any ideas?
Understanding the error message would be the first step - it complains about a missing key. Check whether data['response']['results'] is present (hint: it is not) and check what exactly the structure of your data['response'] is.
Fortunately one can use the API key 'test' for that API, so we can help by using that key:
my_params = {
    'q': 'coronavirus,stock,covid',
    'sectionID': 'business',
    'from-date': "2019-01-01",
    'to-date': "2020-09-30",
    'order-by': "newest",
    'show-fields': 'all',
    'page-size': 300,
    'api-key': 'test'  # test key for that API
}
On running, I get the same exception; inspecting data['response'] shows an error payload instead of the expected 'results' and 'pages' keys.
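A minimal sketch of that inspection (resp and data come from the while loop in the question's code):

# Right after data = resp.json(): when 'results' is missing, dump the
# payload to see why - with 'page-size': 300 it is most likely an error
# message about that parameter rather than a result set.
if 'results' not in data.get('response', {}):
    print(json.dumps(data['response'], indent=2))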
Let's see what parameters are given, shall we?
my_params = {
    'q': 'coronavirus,stock,covid',
    'sectionID': 'business',
    'from-date': "2019-01-01",
    'to-date': "2020-09-30",
    'order-by': "newest",
    'show-fields': 'all',
    'page-size': 300,  # TOO BIG
    'api-key': 'test'
}
Fix that to 200 and you'll get
Downloading 2019-01-01
...page 1
Writing to tempdata\articles\2019-01-01.json
Downloading 2019-01-02
...page 1
Writing to tempdata\articles\2019-01-02.json
Downloading 2019-01-03
...page 1
Writing to tempdata\articles\2019-01-03.json
Downloading 2019-01-04
...page 1
Writing to tempdata\articles\2019-01-04.json
Downloading 2019-01-05
[snipp]
I have many csv files that only have one row of data. I need to take data from two of the cells and put them into a master csv file ('new_gal.csv'). Initially this will only contain the headings, but no data.
#The file I am pulling from:
file_name = "N4261_pacs160.csv"
#I have the code written to separate gal_name, cat_name, and cat_num (N4261, pacs, 160)
An example of the csv is given here. I am trying to pull "flux" and "rms" from this file. (Sorry it isn't aligned nicely; I can't figure out the formatting).
name band ra dec raerr decerr flux snr snrnoise stn rms strn fratio fwhmxfit fwhmyfit flag_elong edgeflag flag_blend warmat obsid ssomapflag dist angle
HPPSC160A_J121923.1+054931 red 184.846389 5.8254 0.000151 0.00015 227.036 10.797 21.028 16.507 13.754 37.448 1.074 15.2 11 0.7237 f 0 f 1342199758 f 1.445729 296.577621
I read this csv and pull the data I need
with open(file_name, 'r') as table:
    reader = csv.reader(table, delimiter=',')
    read = iter(reader)
    next(read)
    for row in read:
        fluxP = row[6]
        errP = row[10]

#Open the master csv with pandas
df = pd.read_csv('new_gal.csv')
The master csv file has format:
Galaxy Cluster Mult. Detect. LumDist z W1 W1 err W2 W2 err W3 W3 err W4 W4 err 70 70 err 100 100 err 160 160 err 250 250 err 350 350 err 500 500 err
The main problem I have is that I want to search the "Galaxy" column in 'new_gal.csv' for the galaxy name. If it is not there, I need to add a new row with the galaxy name and the flux and error measurement. When I run this multiple times, I get duplicate rows even though I have the append command nested in the if statement. I only want it to append a new row if the galaxy name is not already there; otherwise, it should only change the values of the flux and error measurements for that galaxy.
if cat_name == 'pacs':
    if gal_name not in df["Galaxy"]:
        df = df.append({"Galaxy": gal_name}, ignore_index=True)
        if cat_num == "70":
            df.loc[df.Galaxy == gal_name, ["70"]] = fluxP
            df.loc[df.Galaxy == gal_name, ["70 err"]] = errP
        elif cat_num == "100":
            df.loc[df.Galaxy == gal_name, ["100"]] = fluxP
            df.loc[df.Galaxy == gal_name, ["100 err"]] = errP
        elif cat_num == "160":
            df.loc[df.Galaxy == gal_name, ["160"]] = fluxP
            df.loc[df.Galaxy == gal_name, ["160 err"]] = errP
    else:
        if cat_num == "70":
            df.loc[df.Galaxy == gal_name, ["70"]] = fluxP
            df.loc[df.Galaxy == gal_name, ["70 err"]] = errP
        elif cat_num == "100":
            df.loc[df.Galaxy == gal_name, ["100"]] = fluxP
            df.loc[df.Galaxy == gal_name, ["100 err"]] = errP
        elif cat_num == "160":
            df.loc[df.Galaxy == gal_name, ["160"]] = fluxP
            df.loc[df.Galaxy == gal_name, ["160 err"]] = errP
After running the code 5 times with the same file, I have 5 identical lines in the table.
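As a side note on the duplicates themselves: gal_name not in df["Galaxy"] tests membership against the Series index (the row labels), not against the galaxy names stored in the column, so it is almost always True and the append fires on every run. A minimal value-based check, using the same variable names as above, might look like this:

# Check the column's values rather than its index before appending
# (sketch only; the rest of the update logic stays unchanged).
if not (df["Galaxy"] == gal_name).any():
    df = df.append({"Galaxy": gal_name}, ignore_index=True)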
I think I've got something that'll work after tinkering with it this morning...
A couple of points: you shouldn't build a DataFrame incrementally in pandas; get the data set up externally, then do one build. In what I have below, I'm building a big dictionary from the small csv files and then using merge to put that together with the master file.
If your .csv files aren't formatted properly, you can either try to replace the split character below or switch over to the csv reader, which is a bit more powerful (see the sketch after the main program below).
You should put all of the smaller .csv files in a folder called 'orig_data' to make this work.
main prog
# galaxy compiler
import os, re
import pandas as pd

# folder location for the small .csvs, NOT the master
data_folder = 'orig_data'  # this folder should be in same directory as program

result = {}
splitter = r'(.+)_([a-zA-Z]+)([0-9]+)\.'  # regex to break up file name into 3 groups

for file in os.listdir(data_folder):
    file_data = {}
    # split up the filename and process
    galaxy, cat_name, cat_num = re.match(splitter, file).groups()
    #print(galaxy, cat_name, cat_num)
    with open(os.path.join(data_folder, file), 'r') as src:
        src.readline()  # read the header and disregard it
        data = src.readline().replace(' ', '').strip().split(',')  # you can change the split char
        flux = float(data[2])
        rms = float(data[3])
        err_tag = cat_num + ' err'
        file_data = {'cat_name': cat_name,
                     cat_num: flux,
                     err_tag: rms}
    result[galaxy] = file_data

df2 = pd.DataFrame.from_dict(result, orient='index')
df2.index.rename('galaxy', inplace=True)
# check the resulting build!
#print(df2)

# build master dataframe
master_df = pd.read_csv('master_data.csv')
#print(master_df.head())

# merge the 2 dataframes on galaxy name. See the dox on merge for other
# options and whether you want an "outer" join or other type of join...
master_df = master_df.merge(df2, how='outer', on='galaxy')

# convert boolean flags properly
conv = {'t': True, 'f': False}
master_df['flag_nova'] = master_df['flag_nova'].map(conv).astype('bool')

print(master_df)
print()
print(master_df.info())
print()
print(master_df.describe())
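As mentioned above, if the small files aren't cleanly comma-separated, the two readline() calls can be swapped for the csv module. A minimal sketch of that alternative for the block inside the for loop (it assumes the same column order of band, weight, flux, rms):

import csv

with open(os.path.join(data_folder, file), 'r', newline='') as src:
    reader = csv.reader(src, skipinitialspace=True)
    next(reader)          # skip the header row
    data = next(reader)   # the single data row
    flux = float(data[2])
    rms = float(data[3])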
example data files in orig_data folder
filename: A99_dbc100.csv
band,weight,flux,rms
junk, 200.44,2e5,2e-8
filename: B250_pacs100.csv
band,weight,flux,rms
nada,2.44,19e-5, 74
...etc.
example master csv
galaxy,color,stars,flag_nova
A99,red,15,f
B250,blue,4e20,t
N1000,green,3e19,f
X99,white,12,t
Result:
galaxy color stars ... 200 err 100 100 err
0 A99 red 1.500000e+01 ... NaN 200000.00000 2.000000e-08
1 B250 blue 4.000000e+20 ... NaN 0.00019 7.400000e+01
2 N1000 green 3.000000e+19 ... 88.0 NaN NaN
3 X99 white 1.200000e+01 ... NaN NaN NaN
[4 rows x 9 columns]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 9 columns):
galaxy 4 non-null object
color 4 non-null object
stars 4 non-null float64
flag_nova 4 non-null bool
cat_name 3 non-null object
200 1 non-null float64
200 err 1 non-null float64
100 2 non-null float64
100 err 2 non-null float64
dtypes: bool(1), float64(5), object(3)
memory usage: 292.0+ bytes
None
stars 200 200 err 100 100 err
count 4.000000e+00 1.0 1.0 2.000000 2.000000e+00
mean 1.075000e+20 1900000.0 88.0 100000.000095 3.700000e+01
std 1.955121e+20 NaN NaN 141421.356103 5.232590e+01
min 1.200000e+01 1900000.0 88.0 0.000190 2.000000e-08
25% 1.425000e+01 1900000.0 88.0 50000.000143 1.850000e+01
50% 1.500000e+19 1900000.0 88.0 100000.000095 3.700000e+01
75% 1.225000e+20 1900000.0 88.0 150000.000048 5.550000e+01
max 4.000000e+20 1900000.0 88.0 200000.000000 7.400000e+01
I have been all over this site and google trying to solve this problem.
It appears as though I'm missing a fundamental concept in making a plottable dataframe.
I've tried to ensure that I have a column of strings for the "Teams" and a column of ints for the "Points".
Still I get: TypeError: Empty 'DataFrame': no numeric data to plot
import csv
import pandas
import numpy
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter

set_of_teams = set()

def load_epl_games(file_name):
    with open(file_name, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        raw_data = {"HomeTeam": [], "AwayTeam": [], "FTHG": [], "FTAG": [], "FTR": []}
        for row in reader:
            set_of_teams.add(row["HomeTeam"])
            set_of_teams.add(row["AwayTeam"])
            raw_data["HomeTeam"].append(row["HomeTeam"])
            raw_data["AwayTeam"].append(row["AwayTeam"])
            raw_data["FTHG"].append(row["FTHG"])
            raw_data["FTAG"].append(row["FTAG"])
            raw_data["FTR"].append(row["FTR"])
    data_frame = pandas.DataFrame(data=raw_data)
    return data_frame

def calc_points(team, table):
    points = 0
    for row_number in range(table["HomeTeam"].count()):
        home_team = table.loc[row_number, "HomeTeam"]
        away_team = table.loc[row_number, "AwayTeam"]
        if team in [home_team, away_team]:
            home_team_points = 0
            away_team_points = 0
            winner = table.loc[row_number, "FTR"]
            if winner == 'H':
                home_team_points = 3
            elif winner == 'A':
                away_team_points = 3
            else:
                home_team_points = 1
                away_team_points = 1
            if team == home_team:
                points += home_team_points
            else:
                points += away_team_points
    return points

def get_goals_scored_conceded(team, table):
    scored = 0
    conceded = 0
    for row_number in range(table["HomeTeam"].count()):
        home_team = table.loc[row_number, "HomeTeam"]
        away_team = table.loc[row_number, "AwayTeam"]
        if team in [home_team, away_team]:
            if team == home_team:
                scored += int(table.loc[row_number, "FTHG"])
                conceded += int(table.loc[row_number, "FTAG"])
            else:
                scored += int(table.loc[row_number, "FTAG"])
                conceded += int(table.loc[row_number, "FTHG"])
    return (scored, conceded)

def compute_table(df):
    raw_data = {"Team": [], "Points": [], "GoalDifference": [], "Goals": []}
    for team in set_of_teams:
        goal_data = get_goals_scored_conceded(team, df)
        raw_data["Team"].append(team)
        raw_data["Points"].append(calc_points(team, df))
        raw_data["GoalDifference"].append(goal_data[0] - goal_data[1])
        raw_data["Goals"].append(goal_data[0])
    data_frame = pandas.DataFrame(data=raw_data)
    data_frame = data_frame.sort_values(["Points", "GoalDifference", "Goals"], ascending=[False, False, False]).reset_index(drop=True)
    data_frame.index = numpy.arange(1, len(data_frame) + 1)
    data_frame.index.names = ["Finish"]
    return data_frame

def get_finish(team, table):
    return table[table.Team == team].index.item()

def get_points(team, table):
    return table[table.Team == team].Points.item()

def display_hbar(tables):
    raw_data = {"Team": [], "Points": []}
    for row_number in range(tables["Team"].count()):
        raw_data["Team"].append(tables.loc[row_number + 1, "Team"])
        raw_data["Points"].append(int(tables.loc[row_number + 1, "Points"]))
    df = pandas.DataFrame(data=raw_data)
    #df = pandas.DataFrame(tables, columns=["Team", "Points"])
    print(df)
    print(df.dtypes)
    df["Points"].apply(int)
    print(df.dtypes)
    df.plot(kind='barh', x='Points', y='Team')

games = load_epl_games('epl2016.csv')
final_table = compute_table(games)
#print(final_table)
#print(get_finish("Tottenham", final_table))
#print(get_points("West Ham", final_table))
display_hbar(final_table)
The output:
Team Points
0 Chelsea 93
1 Tottenham 86
2 Man City 78
3 Liverpool 76
4 Arsenal 75
5 Man United 69
6 Everton 61
7 Southampton 46
8 Bournemouth 46
9 West Brom 45
10 West Ham 45
11 Leicester 44
12 Stoke 44
13 Crystal Palace 41
14 Swansea 41
15 Burnley 40
16 Watford 40
17 Hull 34
18 Middlesbrough 28
19 Sunderland 24
Team object
Points int64
dtype: object
Team object
Points int64
dtype: object
Traceback (most recent call last):
File "C:/Users/Michael/Documents/Programming/Python/Premier League.py", line 99, in <module>
display_hbar(final_table)
File "C:/Users/Michael/Documents/Programming/Python/Premier League.py", line 92, in display_hbar
df.plot(kind='barh',x='Points',y='Team')
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 2941, in __call__
sort_columns=sort_columns, **kwds)
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 1977, in plot_frame
**kwds)
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 1804, in _plot
plot_obj.generate()
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 258, in generate
self._compute_plot_data()
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 373, in _compute_plot_data
'plot'.format(numeric_data.__class__.__name__))
TypeError: Empty 'DataFrame': no numeric data to plot
What am I doing wrong in my display_hbar function that is preventing me from plotting my data?
Here is the csv file
df.plot(x = "Team", y="Points", kind="barh");
You should swap x and y in df.plot(...), because y must be numeric according to the pandas documentation.
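For completeness, a minimal sketch of display_hbar with the corrected call; the plt.show() at the end is an addition on my part, since the original script never displays the figure:

import matplotlib.pyplot as plt

def display_hbar(tables):
    # Plot straight from the computed table: Team on the categorical axis,
    # Points as the numeric values.
    df = tables[["Team", "Points"]].reset_index(drop=True)
    df.plot(x="Team", y="Points", kind="barh")
    plt.show()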