Pandas: custom aggregate functions for DataFrameGroupBy - python-3.x

I have the following data frame my_df:
name date A_score B_score
------------------------------------------
John 2017-01-01 5 6
John 2017-01-10 10 8
John 2017-02-04 3 5
Andy 2017-01-25 8 9
Andy 2017-02-05 7 1
Andy 2017-02-12 9 9
For each name, we want to find the absolute delta of A_score and B_score. The absolute delta is defined as the absolute value of the difference between the score on the earliest date and the score on the second earliest date.
The resulting data frame should be like:
name A_score_result B_score_result
----------------------------------------------
John 5 2
Andy 1 8
To achieve this, I tried:
new_df = my_df.groupby('name').apply(lambda x:myFun(x))
and
new_df = my_df.groupby('name').agg(['myFun'])
where myFun is:
def myFun(x):
    """Return ``x[2] - x[1]``.

    The lookup uses plain ``[]`` indexing, so ``x`` must accept ``1`` and
    ``2`` as keys; whatever exception ``x`` raises for a missing key
    propagates to the caller.
    """
    return x[2] - x[1]
However, both approaches have errors like below:
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/usr/local/lib/python3.4/dist-packages/pandas/core/generic.py in _get_item_cache(self, item)
1384 res = cache.get(item)
1385 if res is None:
-> 1386 values = self._data.get(item)
1387 res = self._box_item_values(item, values)
1388 cache[item] = res
/usr/local/lib/python3.4/dist-packages/pandas/core/internals.py in get(self, item, fastpath)
3541
3542 if not isnull(item):
-> 3543 loc = self.items.get_loc(item)
3544 else:
3545 indexer = np.arange(len(self.items))[isnull(self.items)]
/usr/local/lib/python3.4/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2134 return self._engine.get_loc(key)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
2138 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4145)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4009)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13166)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13120)()
KeyError: 0
Any suggestion on how to fix this problem? Thanks a lot!

Try this:
In [358]: df.drop('date',1).groupby('name').agg(lambda x: abs(x.iloc[1] - x.iloc[0]))
Out[358]:
A_score B_score
name
Andy 1 8
John 5 2

Related

Value Error: index must be monotonic increasing or decreasing while using resample('M') function on datetime values

I am stuck at this point in my code. I am trying to divide the startdate and enddate into multiple rows based on months and for that I am trying to use the resample function to sample the dates on monthly basis. The sample code looks like this-
PS- A lot of the BCA_REF, STARTDATE, ENDDATE values are repeated and are not unique owing to the usecase
df = pd.DataFrame(
data = [['abc','2018-08-01','2025-07-31'], ['abc','2018-08-01','2025-07-31'],['xyz','2017-04-01','2017-04-01'], ['xyz','2017-04-01','2017-04-01'], ['pqr','2016-05-16','2017-10-15']],
columns = ['BCA_REF', 'STARTDATE', 'ENDDATE']
)
df['STARTDATE'] = pd.to_datetime(df['STARTDATE'])
df['ENDDATE'] = pd.to_datetime(df['ENDDATE'])
df_start_end = df.melt(id_vars=['BCA_REF'],value_vars=['STARTDATE','ENDDATE'], value_name='date')
df_new = (
df_start_end.groupby(['BCA_REF'])
.apply(lambda x: x.drop_duplicates('date').set_index('date')
.resample('M').pad())
.drop(columns=['BCA_REF','variable'])
.reset_index()
)
After I run this for 40K such rows, it gives me the following error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_15069/2048245652.py in <module>
4 merged_final_new = (
5 mf_start_end.groupby(['BCA_REF'])
----> 6 .apply(lambda x: x.drop_duplicates('date').set_index('date')
7 .resample('M').pad())
8 # .drop(columns=['BCA_REF','variable'])
~/.local/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
1273 with option_context("mode.chained_assignment", None):
1274 try:
-> 1275 result = self._python_apply_general(f, self._selected_obj)
1276 except TypeError:
1277 # gh-20949
~/.local/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f, data)
1307 data after applying f
1308 """
-> 1309 keys, values, mutated = self.grouper.apply(f, data, self.axis)
1310
1311 return self._wrap_applied_output(
~/.local/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
850 # group might be modified
851 group_axes = group.axes
--> 852 res = f(group)
853 if not _is_indexed_like(res, group_axes, axis):
854 mutated = True
/tmp/ipykernel_15069/2048245652.py in <lambda>(x)
5 mf_start_end.groupby(['BCA_REF'])
6 .apply(lambda x: x.drop_duplicates('date').set_index('date')
----> 7 .resample('M').pad())
8 # .drop(columns=['BCA_REF','variable'])
9 # .reset_index()
~/.local/lib/python3.7/site-packages/pandas/core/resample.py in pad(self, limit)
507 DataFrame.fillna: Fill NA/NaN values using the specified method.
508 """
--> 509 return self._upsample("pad", limit=limit)
510
511 ffill = pad
~/.local/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
1204 else:
1205 result = obj.reindex(
-> 1206 res_index, method=method, limit=limit, fill_value=fill_value
1207 )
1208
~/.local/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
322 @wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
325
326 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
~/.local/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
4770 kwargs.pop("axis", None)
4771 kwargs.pop("labels", None)
-> 4772 return super().reindex(**kwargs)
4773
4774 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
~/.local/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4817 # perform the reindex on the axes
4818 return self._reindex_axes(
-> 4819 axes, level, limit, tolerance, method, fill_value, copy
4820 ).__finalize__(self, method="reindex")
4821
~/.local/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4596 if index is not None:
4597 frame = frame._reindex_index(
-> 4598 index, method, copy, level, fill_value, limit, tolerance
4599 )
4600
~/.local/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
4612 ):
4613 new_index, indexer = self.index.reindex(
-> 4614 new_index, method=method, level=level, limit=limit, tolerance=tolerance
4615 )
4616 return self._reindex_with_indexers(
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
3824 if self._index_as_unique:
3825 indexer = self.get_indexer(
-> 3826 target, method=method, limit=limit, tolerance=tolerance
3827 )
3828 else:
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
3484 )
3485
-> 3486 return self._get_indexer(target, method, limit, tolerance)
3487
3488 def _get_indexer(
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _get_indexer(self, target, method, limit, tolerance)
3506
3507 if method in ["pad", "backfill"]:
-> 3508 indexer = self._get_fill_indexer(target, method, limit, tolerance)
3509 elif method == "nearest":
3510 indexer = self._get_nearest_indexer(target, limit, tolerance)
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _get_fill_indexer(self, target, method, limit, tolerance)
3582 indexer = engine_method(target_values, limit)
3583 else:
-> 3584 indexer = self._get_fill_indexer_searchsorted(target, method, limit)
3585 if tolerance is not None and len(self):
3586 indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance)
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _get_fill_indexer_searchsorted(self, target, method, limit)
3606 indexer = self.get_indexer(target)
3607 nonexact = indexer == -1
-> 3608 indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side)
3609 if side == "left":
3610 # searchsorted returns "indices into a sorted array such that,
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _searchsorted_monotonic(self, label, side)
5763 return len(self) - pos
5764
-> 5765 raise ValueError("index must be monotonic increasing or decreasing")
5766
5767 def get_slice_bound(self, label, side: str_t, kind=None) -> int:
ValueError: index must be monotonic increasing or decreasing
I tried to look for solutions for this error wherein people suggested using sort_index()/sort_values() for the 'date' column but it still does not work. I believe the issue is with the resample function.
Any help would be appreciated. Thank you

How to handle sntwitter KeyError "player_stream_content_type" in Python?

I am collecting historic tweets using sntwitter (ref: https://betterprogramming.pub/how-to-scrape-tweets-with-snscrape-90124ed006af). For some of the keyword searches, I am getting the error "player_stream_content_type". I got the source code for the module on github (https://github.com/JustAnotherArchivist/snscrape/blob/master/snscrape/modules/twitter.py) but I am unable to figure out how to handle the error. Any suggestions on how to handle this are highly appreciated.
import tweepy
import pandas as pd
import os
import snscrape.modules.twitter as sntwitter
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
# Creating list to append tweet data to
tweets_list1 = []
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('itv since:2017-03-06 until:2017-04-03').get_items()):
if tweet.lang=="en":
tweets_list1.append([tweet.date, tweet.id, tweet.rawContent, tweet.user.username,tweet.user.id,
tweet.user.followersCount, tweet.user.friendsCount,tweet.user.location,
tweet.replyCount, tweet.retweetCount,tweet.likeCount,tweet.quoteCount,
tweet.hashtags,tweet.inReplyToUser,tweet.mentionedUsers
])
print(len(tweets_list1))
tweets_df2 = pd.DataFrame(tweets_list1, columns=['Datetime', 'Tweet Id', 'Text', 'Username',
'user_id','user_followers_count','user_friends_count',
'user_location','reply_count','retweet_count','like_count',
'quote_count','hashtags',
'is_reply_to','mentioned_users'])
Error message:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-14-716543e9be2b> in <module>
2 tweets_list1 = []
3
----> 4 for i,tweet in enumerate(sntwitter.TwitterSearchScraper('itv since:2017-03-06 until:2017-04-03').get_items()):
5 if tweet.lang=="en":
6
/opt/anaconda3/envs/p38/lib/python3.8/site-packages/snscrape/modules/twitter.py in get_items(self)
1448
1449 for obj in self._iter_api_data('https://api.twitter.com/2/search/adaptive.json', _TwitterAPIType.V2, params, paginationParams, cursor = self._cursor):
-> 1450 yield from self._v2_timeline_instructions_to_tweets(obj)
1451
1452 @classmethod
/opt/anaconda3/envs/p38/lib/python3.8/site-packages/snscrape/modules/twitter.py in _v2_timeline_instructions_to_tweets(self, obj, includeConversationThreads)
802 for entry in entries:
803 if entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-'):
--> 804 yield from self._v2_instruction_tweet_entry_to_tweet(entry['entryId'], entry['content'], obj)
805 elif includeConversationThreads and entry['entryId'].startswith('conversationThread-') and not entry['entryId'].endswith('-show_more_cursor'):
806 for item in entry['content']['timelineModule']['items']:
/opt/anaconda3/envs/p38/lib/python3.8/site-packages/snscrape/modules/twitter.py in _v2_instruction_tweet_entry_to_tweet(self, entryId, entry, obj)
825 else:
826 raise snscrape.base.ScraperException(f'Unable to handle entry {entryId!r}')
--> 827 yield self._tweet_to_tweet(tweet, obj)
828
829 def _get_tweet_id(self, tweet):
/opt/anaconda3/envs/p38/lib/python3.8/site-packages/snscrape/modules/twitter.py in _tweet_to_tweet(self, tweet, obj)
1267 kwargs['quotedTweet'] = self._tweet_to_tweet(obj['globalObjects']['tweets'][tweet['quoted_status_id_str']], obj)
1268 if 'card' in tweet:
-> 1269 kwargs['card'] = self._make_card(tweet['card'], _TwitterAPIType.V2, self._get_tweet_id(tweet))
1270 return self._make_tweet(tweet, user, **kwargs)
1271
/opt/anaconda3/envs/p38/lib/python3.8/site-packages/snscrape/modules/twitter.py in _make_card(self, card, apiType, tweetId)
1113 video = Video(
1114 thumbnailUrl = bindingValues['player_image'],
-> 1115 variants = [VideoVariant(contentType = bindingValues['player_stream_content_type'], url = bindingValues['amplify_url_vmap'], bitrate = None)],
1116 ),
1117 )
KeyError: 'player_stream_content_type'

Featuretools TypeError: unhashable type: 'Int64Index'

I am trying to create an entity set from 3 dataframes and while doing so I am getting the error: TypeError: unhashable type: 'Int64Index'
I have searched the Internet for similar issues but could not find any issues related to datetime types. Please note that none of the columns of df_raw_view_logs are unique and hence none of the columns can be used as index value and hence the dataframe.index is being used.
I am sharing the dtypes for the dataframe for which it is throwing error when I make a column from it as a time index.
df_raw_view_logs.dtypes
server_time datetime64[ns]
device_type int8
session_id int64
user_id int64
item_id int64
dtype: object
es = ft.EntitySet()
es = es.entity_from_dataframe(entity_id="train",
dataframe=df_es_train,
index=df_es_train.index,
time_index="impression_time",
)
es = es.entity_from_dataframe(entity_id="viewlogs",
dataframe=df_es_view_logs,
index=df_es_view_logs.index,
time_index="server_time",
)
es = es.entity_from_dataframe(entity_id="itemdata",
dataframe=df_es_item_data,
index=df_es_item_data.index,
)
new_relationship = ft.Relationship(es["train"]["user_id"],
es["viewlogs"]["user_id"])
es = es.add_relationship(new_relationship)
new_relationship_1 = ft.Relationship(es["viewlogs"]["item_id"],
es["itemdata"]["item_id"])
es = es.add_relationship(new_relationship_1)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-32-81425e9b87c5> in <module>
9 dataframe=df_es_view_logs,
10 index=df_es_view_logs.index,
---> 11 time_index="server_time",
12 )
13
D:\Anaconda3\envs\fastai\lib\site-packages\featuretools\entityset\entityset.py in entity_from_dataframe(self, entity_id, dataframe, index, variable_types, make_index, time_index, secondary_time_index, already_sorted)
495 secondary_time_index=secondary_time_index,
496 already_sorted=already_sorted,
--> 497 make_index=make_index)
498 self.entity_dict[entity.id] = entity
499 self.reset_data_description()
D:\Anaconda3\envs\fastai\lib\site-packages\featuretools\entityset\entity.py in __init__(self, id, df, entityset, variable_types, index, time_index, secondary_time_index, last_time_index, already_sorted, make_index, verbose)
67 """
68 _validate_entity_params(id, df, time_index)
---> 69 created_index, index, df = _create_index(index, make_index, df)
70
71 self.id = id
D:\Anaconda3\envs\fastai\lib\site-packages\featuretools\entityset\entity.py in _create_index(index, make_index, df)
547 # Case 3: user wanted to make index but column already exists
548 raise RuntimeError("Cannot make index: index variable already present")
--> 549 elif index not in df.columns:
550 if not make_index:
551 # Case 4: user names index, it is not in df. does not specify
D:\Anaconda3\envs\fastai\lib\site-packages\pandas\core\indexes\base.py in __contains__(self, key)
3917 @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
3918 def __contains__(self, key):
-> 3919 hash(key)
3920 try:
3921 return key in self._engine
D:\Anaconda3\envs\fastai\lib\site-packages\pandas\core\indexes\base.py in __hash__(self)
3932
3933 def __hash__(self):
-> 3934 raise TypeError("unhashable type: %r" % type(self).__name__)
3935
3936 def __setitem__(self, key, value):
TypeError: unhashable type: 'Int64Index'
This is erroring because the index argument is supposed to be a string naming the column in your DataFrame that serves as the index, not the index values themselves.

How to iterate over window objects to add them to a DataFrame?

I have an object that appears to be a window object, EWM [com=9.5,min_periods=0,adjust=True,ignore_na=False,axis=0]. It was created from predictions_df_list["prices"], and I expected it to be a series with dates as the index and the exponentially weighted average of the prices as values. I wanted to add it to a dataframe as predictions_df_list['ewma'], yet the assignment raised a NotImplementedError while pandas was inferring the type of the value:
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-21-b1286fe39d1c> in <module>
---> 59 predictions_df_list['ewma'] = pd.DataFrame.ewm(predictions_df_list["prices"], span=20) #pd.DataFrame.ewma
60 predictions_df_list['actual_value'] = test['prices']
61 predictions_df_list['actual_value_ewma'] = pd.DataFrame.ewm(predictions_df_list["actual_value"], span=20)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3117 else:
3118 # set column
-> 3119 self._set_item(key, value)
3120
3121 def _setitem_slice(self, key, value):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3192
3193 self._ensure_valid_index(value)
-> 3194 value = self._sanitize_column(key, value)
3195 NDFrame._set_item(self, key, value)
3196
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
3385 value = _sanitize_index(value, self.index, copy=False)
3386
-> 3387 elif isinstance(value, Index) or is_sequence(value):
3388 from pandas.core.series import _sanitize_index
3389
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\dtypes\inference.py in is_sequence(obj)
470
471 try:
--> 472 iter(obj) # Can iterate over it.
473 len(obj) # Has a length associated with it.
474 return not isinstance(obj, string_and_binary_types)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\window.py in __iter__(self)
184 def __iter__(self):
185 url = 'https://github.com/pandas-dev/pandas/issues/11704'
--> 186 raise NotImplementedError('See issue #11704 {url}'.format(url=url))
187
188 def _get_index(self, index=None):
NotImplementedError: See issue #11704 https://github.com/pandas-dev/pandas/issues/11704
When looking for documentation on window objects, it seems that window objects are Python 2 objects. Anyway, here is predictions_df_list["prices"], which I am working with, for reproducing the error:
2007-11-01 14021.1
2007-11-02 13825.1
2007-11-03 13533.1
2007-11-04 14021.1
2007-11-05 13345.1
2007-11-06 12578.1
2007-11-07 14021.1
2007-11-08 13533.1
2007-11-09 12678.1
2007-11-10 12578.1
2007-11-11 14021.1
2007-11-12 13825.1
2007-11-13 13533.1
2007-11-14 12661.1
2007-11-15 13320.1
2007-11-16 12678.1
2007-11-17 12775.1
2007-11-18 13533.1
2007-11-19 13868.1
2007-11-20 12581.1
2007-11-21 13345.1
2007-11-22 13533.1
2007-11-23 12678.1
2007-11-24 13533.1
2007-11-25 12684.1
2007-11-26 13825.1
2007-11-27 14021.1
2007-11-28 14021.1
2007-11-29 12678.1
2007-11-30 12578.1
...
2007-12-02 13320.1
2007-12-03 12661.1
2007-12-04 13533.1
2007-12-05 12578.1
2007-12-06 13533.1
2007-12-07 13533.1
2007-12-08 14021.1
2007-12-09 12639.1
2007-12-10 12661.1
2007-12-11 13345.1
2007-12-12 12578.1
2007-12-13 14021.1
2007-12-14 13345.1
2007-12-15 13533.1
2007-12-16 12895.1
2007-12-17 13686.1
2007-12-18 14052.1
2007-12-19 14021.1
2007-12-20 13686.1
2007-12-21 12730.1
2007-12-22 13686.1
2007-12-23 12586.1
2007-12-24 12741.1
2007-12-25 12678.1
2007-12-26 13533.1
2007-12-27 12775.1
2007-12-28 12578.1
2007-12-29 12661.1
2007-12-30 12895.1
2007-12-31 12639.1
Freq: D, Name: prices, Length: 61, dtype: float64
Your ewma values can be found by taking the EWM object you have and calling .mean() on it.
df['ewm'] = df['values'].ewm(alpha=0.001).mean()

Cannot plot dataframe as barh because TypeError: Empty 'DataFrame': no numeric data to plot

I have been all over this site and google trying to solve this problem.
It appears as though I'm missing a fundamental concept in making a plottable dataframe.
I've tried to ensure that I have a column of strings for the "Teams" and a column of ints for the "Points"
Still I get: TypeError: Empty 'DataFrame': no numeric data to plot
import csv
import pandas
import numpy
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
# Every team name seen by load_epl_games(); read later by compute_table().
set_of_teams = set()


def load_epl_games(file_name):
    """Load match results from a CSV file into a DataFrame.

    Only the columns ``HomeTeam``, ``AwayTeam``, ``FTHG``, ``FTAG`` and
    ``FTR`` are kept, in that order. Values are stored exactly as the
    ``csv`` module yields them, so the goal columns hold strings, not ints.

    Side effect: every home and away team name is added to the
    module-level ``set_of_teams``.

    Raises ``KeyError`` if the file lacks one of the five columns.
    """
    columns = ("HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR")
    raw_data = {column: [] for column in columns}
    with open(file_name, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            set_of_teams.update((row["HomeTeam"], row["AwayTeam"]))
            for column in columns:
                raw_data[column].append(row[column])
    return pandas.DataFrame(data=raw_data)
def calc_points(team, table):
    """Return the total points ``team`` earned over the matches in ``table``.

    ``table`` must provide the columns ``HomeTeam``, ``AwayTeam`` and
    ``FTR``. An ``FTR`` of ``'H'`` gives the home side 3 points, ``'A'``
    gives the away side 3 points, and any other value is scored as a draw
    worth 1 point to each side. Matches not involving ``team`` are ignored.
    """
    points = 0
    # Walk the columns in lockstep instead of using .loc[row_number], so the
    # result does not depend on the frame carrying a 0..n-1 integer index.
    fixtures = zip(table["HomeTeam"], table["AwayTeam"], table["FTR"])
    for home_team, away_team, winner in fixtures:
        if team not in (home_team, away_team):
            continue
        if winner == 'H':
            home_team_points, away_team_points = 3, 0
        elif winner == 'A':
            home_team_points, away_team_points = 0, 3
        else:
            home_team_points, away_team_points = 1, 1
        points += home_team_points if team == home_team else away_team_points
    return points
def get_goals_scored_conceded(team, table):
    """Return ``(scored, conceded)`` goal totals for ``team``.

    ``table`` must provide the columns ``HomeTeam``, ``AwayTeam``, ``FTHG``
    and ``FTAG``. When ``team`` is the home side, ``FTHG`` counts as scored
    and ``FTAG`` as conceded; when it is the away side the roles swap. Goal
    values are passed through ``int()``, so numeric strings are accepted.
    """
    scored = 0
    conceded = 0
    # Walk the columns in lockstep instead of using .loc[row_number], so the
    # result does not depend on the frame carrying a 0..n-1 integer index.
    fixtures = zip(
        table["HomeTeam"], table["AwayTeam"], table["FTHG"], table["FTAG"]
    )
    for home_team, away_team, home_goals, away_goals in fixtures:
        if team == home_team:
            scored += int(home_goals)
            conceded += int(away_goals)
        elif team == away_team:
            scored += int(away_goals)
            conceded += int(home_goals)
    return (scored, conceded)
def compute_table(df):
    """Build the league table for every team in ``set_of_teams``.

    ``df`` is the match frame handed on to ``calc_points`` and
    ``get_goals_scored_conceded``. The result has the columns ``Team``,
    ``Points``, ``GoalDifference`` (scored minus conceded) and ``Goals``
    (scored), sorted by those three numeric columns in descending order.
    Its index is named ``Finish`` and runs from 1 to the number of teams.
    """
    raw_data = {"Team": [], "Points": [], "GoalDifference": [], "Goals": []}
    # A bare set iterates in arbitrary order; going through the names in
    # sorted order makes the rows handed to sort_values reproducible.
    for team in sorted(set_of_teams, key=str):
        scored, conceded = get_goals_scored_conceded(team, df)
        raw_data["Team"].append(team)
        raw_data["Points"].append(calc_points(team, df))
        raw_data["GoalDifference"].append(scored - conceded)
        raw_data["Goals"].append(scored)
    data_frame = pandas.DataFrame(data=raw_data)
    data_frame = data_frame.sort_values(
        ["Points", "GoalDifference", "Goals"],
        ascending=[False, False, False],
    ).reset_index(drop=True)
    # Re-label the rows 1..n so the index reads as the finishing position.
    data_frame.index = numpy.arange(1, len(data_frame) + 1)
    data_frame.index.names = ["Finish"]
    return data_frame
def get_finish(team, table):
    """Return the index label of the single row whose ``Team`` is ``team``.

    ``.item()`` only succeeds when exactly one row matches; otherwise
    pandas raises ``ValueError``.
    """
    matches = table[table["Team"] == team]
    return matches.index.item()
def get_points(team, table):
    """Return the ``Points`` value of the single row whose ``Team`` is ``team``.

    ``.item()`` only succeeds when exactly one row matches; otherwise
    pandas raises ``ValueError``.
    """
    matches = table[table["Team"] == team]
    return matches["Points"].item()
def display_hbar(tables):
    """Print the Team/Points columns of ``tables`` and plot them with barh.

    A fresh frame with a default 0-based index is built from the ``Team``
    and ``Points`` columns. The frame and its dtypes are printed for
    debugging before the plot call.
    """
    # Read the columns directly rather than via .loc[row_number + 1], so the
    # function no longer depends on ``tables`` having a 1-based index.
    raw_data = {
        "Team": tables["Team"].tolist(),
        "Points": [int(points) for points in tables["Points"]],
    }
    df = pandas.DataFrame(data=raw_data)
    print(df)
    print(df.dtypes)
    # Keep the converted column; the bare apply() call discarded its result.
    df["Points"] = df["Points"].apply(int)
    print(df.dtypes)
    # NOTE(review): this call is kept exactly as asked about. It raises
    # "TypeError: Empty 'DataFrame': no numeric data to plot" because the
    # y column ('Team') holds strings and y must be numeric.
    df.plot(kind='barh',x='Points',y='Team')
games = load_epl_games('epl2016.csv')
final_table = compute_table(games)
#print(final_table)
#print(get_finish("Tottenham", final_table))
#print(get_points("West Ham", final_table))
display_hbar(final_table)
The output:
Team Points
0 Chelsea 93
1 Tottenham 86
2 Man City 78
3 Liverpool 76
4 Arsenal 75
5 Man United 69
6 Everton 61
7 Southampton 46
8 Bournemouth 46
9 West Brom 45
10 West Ham 45
11 Leicester 44
12 Stoke 44
13 Crystal Palace 41
14 Swansea 41
15 Burnley 40
16 Watford 40
17 Hull 34
18 Middlesbrough 28
19 Sunderland 24
Team object
Points int64
dtype: object
Team object
Points int64
dtype: object
Traceback (most recent call last):
File "C:/Users/Michael/Documents/Programming/Python/Premier League.py", line 99, in <module>
display_hbar(final_table)
File "C:/Users/Michael/Documents/Programming/Python/Premier League.py", line 92, in display_hbar
df.plot(kind='barh',x='Points',y='Team')
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 2941, in __call__
sort_columns=sort_columns, **kwds)
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 1977, in plot_frame
**kwds)
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 1804, in _plot
plot_obj.generate()
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 258, in generate
self._compute_plot_data()
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas\plotting\_core.py", line 373, in _compute_plot_data
'plot'.format(numeric_data.__class__.__name__))
TypeError: Empty 'DataFrame': no numeric data to plot
What am I doing wrong in my display_hbar function that is preventing me from plotting my data?
Here is the csv file
df.plot(x = "Team", y="Points", kind="barh");
You should swap x and y in df.plot(...), because y must be numeric according to the pandas documentation.

Resources