How do I unstack these three histogram columns? Position Argument Fails - python-3.x

I am using the code below to create a histogram plot of three different variables. I would like to separate out the three bars at each data point in order to get a better visualization. I tried adding the "position" argument for each feature, but it doesn't work.
count, bin_edges = np.histogram(df['age'])
fig = plt.figure(figsize=(7,3))
ax = fig.add_subplot(111) # Create matplotlib axes
df['age'].plot(kind = 'hist', figsize=(10,5), xticks = bin_edges,
               width = 2, color = 'blue', alpha=0.4)
df[df['y'] == 1]['age'].plot(kind = 'hist', figsize=(10,5), xticks = bin_edges,
                             width = 2, color='red', alpha=0.4)
df[(df['y'] == 1)&(df['new_customer'] == 1)]['age'].plot(kind = 'hist', figsize=(10,5), xticks = bin_edges,
                                                         width = 2, color='green', alpha=0.4)
plt.title("Age")
plt.xlabel("Age Bins")
plt.ylabel("Number of Contacts")
plt.legend(loc='upper right')
plt.show()
EDIT: this is what my df looks like:
df[['age', 'y', 'new_customer']]
age y new_customer
0 56 0 1
1 57 0 1
2 37 0 1
3 40 0 1
4 56 0 1
5 45 0 1
6 59 0 1
7 41 0 1
8 24 0 1
9 25 0 1
10 41 0 1
11 25 0 1
12 29 0 1

The pandas plotting API isn't nearly as flexible as the underlying Matplotlib library it uses to make the actual plots. Just use Matplotlib directly: when plt.hist is given a list of arrays, it draws the groups side by side within each bin, which is exactly the separated ("unstacked") layout you are after.
import io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
csv = ''' age y new_customer
0 56 0 1
1 57 1 1
2 37 0 1
3 40 0 1
4 56 1 1
5 45 0 0
6 59 0 1
7 41 1 1
8 24 0 0
9 25 0 1
10 41 1 1
11 25 0 0
12 29 0 1'''
df = pd.read_csv(io.StringIO(csv), sep=r'\s+')  # pd.compat.StringIO was removed from pandas; use io.StringIO
bin_edges = np.histogram_bin_edges(df['age'])
fig = plt.figure(figsize=(7,3))
ax = fig.add_subplot(111) # Create matplotlib axes
data = [df['age'],
        df[df['y'] == 1]['age'],
        df[(df['y'] == 1) & (df['new_customer'] == 1)]['age']]
plt.hist(data, bins=bin_edges, label=['age', 'age_y', 'age_y_newcustomer'])
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
plt.xticks(bin_centers)
plt.title("Age")
plt.xlabel("Age Bins (center)")
plt.ylabel("Number of Contacts")
plt.legend()
plt.show()
Output:

Related

How to write values over matplotlib bar charts without distorted figures

NOTE
The answers here produce a distorted figure; this is the modified code that produces the bad result:
def plot_compare_bar(col1, col2, frame, fig_prefix=''):
    frame = frame.sort_values(by=col1)
    ind = np.arange(len(frame))
    width = 0.4
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.barh(ind, frame[col1], width, color='red', label=col1)
    ax.barh(ind + width, frame[col2], width, color='blue', label=col2)
    ax.set(
        yticks=ind + width, yticklabels=frame['Class Name'],
        ylim=[2 * width - 1, len(frame)], title=(
            f'{fig_prefix} {col1} vs {col2} evaluation results'))
    for i, v in enumerate(frame[col1].values):
        ax.text(v + 3, i + .25, str(v), color='red', fontweight='bold')
    for i, v in enumerate(frame[col2].values):
        ax.text(v + 3, i + .25, str(v), color='blue', fontweight='bold')
    ax.legend()
The original question:
import matplotlib.pyplot as plt
import numpy as np

def plot_compare_bar(col1, col2, frame, fig_prefix=''):
    frame = frame.sort_values(by=col1)
    ind = np.arange(len(frame))
    width = 0.4
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.barh(ind, frame[col1], width, color='red', label=col1)
    ax.barh(ind + width, frame[col2], width, color='blue', label=col2)
    ax.set(
        yticks=ind + width, yticklabels=frame['Class Name'],
        ylim=[2 * width - 1, len(frame)], title=(
            f'{fig_prefix} {col1} vs {col2} evaluation results'))
    ax.legend()
frame is a pandas DataFrame that looks like the following:
Class Name Average Precision Actual Detections True Positives False Positives Combined
2 Palm Tree 91.152760 1379 1428 1292 141 1433
5 Traffic Lights 71.026533 1269 1036 948 88 1036
3 Street Lamp 63.871910 995 848 727 121 848
0 Car 57.535491 3153 1955 1881 104 1985
1 Street Sign 56.925982 1109 704 658 46 704
6 Pedestrian 55.243564 1418 887 835 73 908
13 Road Block 52.182602 363 215 203 14 217
4 Minivan 51.786659 68 41 38 3 41
11 Bus 36.805556 43 18 16 2 18
9 Trash Can 14.444444 90 13 13 0 13
10 Bicycle 5.882353 17 1 1 0 1
8 Flag 5.000000 124 10 7 3 10
7 Fire Hydrant 1.923077 52 1 1 0 1
12 Pickup Truck 0.000000 20 0 0 0 0
14 Delivery Truck 0.000000 4 0 0 0 0
15 Motorcycle 0.000000 3 0 0 0 0
The function I defined above produces the following plot:
I need every bar's value written next to it, which might look like this:
How can I modify the function above to do that?
You should not hard-code the text. Instead, try to extract the values from the patches:
def plot_compare_bar(col1, col2, frame, fig_prefix=''):
    frame = frame.sort_values(by=col1)
    ind = np.arange(len(frame))
    width = 0.4
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.barh(ind, frame[col1], width, color='red', label=col1)
    ax.barh(ind + width, frame[col2], width, color='blue', label=col2)
    ax.set(
        yticks=ind + width, yticklabels=frame['Class Name'],
        ylim=[2 * width - 1, len(frame)], title=(
            f'{fig_prefix} {col1} vs {col2} evaluation results'))
    # annotation here
    for patch in ax.patches:
        # extract information from each bar patch
        pw = patch.get_width()
        _, y = patch.get_xy()
        color = patch.get_facecolor()
        ax.text(pw + 3, y + width / 2, str(pw),
                color=color, verticalalignment='center')
    ax.legend(loc='lower right')
Output:

How can I add an X axis showing plot data seconds to a matplotlib pyplot price volume graph?

The code below plots a price/volume chart using data from a tab-separated CSV file. Each row contains values for these columns: IDX, TRD, TIMESTAMPMS, VOLUME and PRICE. As is, the X axis shows the IDX value. I would like the X axis to display the seconds computed from the millisecond timestamp attached to each row. How can this be done?
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
data = pd.read_csv('secondary-2018-08-12-21-32-56.csv', index_col=0, sep='\t')
print(data.head(50))
fig, ax = plt.subplots(nrows=2, sharex=True, figsize=(10,5))
ax[0].plot(data.index, data['PRICE'])
ax[1].bar(data.index, data['VOLUME'])
plt.show()
The drawn graph looks like this:
Here are the data as displayed by the
print(data.head(50))
instruction:
TRD TIMESTAMPMS VOLUME PRICE
IDX
1 4 1534102380000 0.363583 6330.41
2 20 1534102381000 5.509219 6329.13
3 3 1534102382000 0.199049 6328.69
4 5 1534102383000 1.055055 6327.36
5 2 1534102384000 0.006343 6328.26
6 4 1534102385000 0.167502 6330.38
7 1 1534102386000 0.002039 6326.69
8 0 1534102387000 0.000000 6326.69
9 4 1534102388000 0.163813 6327.62
10 2 1534102389000 0.007060 6326.66
11 4 1534102390000 0.015489 6327.64
12 5 1534102391000 0.035618 6328.35
13 2 1534102392000 0.006003 6330.12
14 5 1534102393000 0.172913 6328.77
15 1 1534102394000 0.019972 6328.03
16 3 1534102395000 0.007429 6328.03
17 1 1534102396000 0.000181 6328.03
18 3 1534102397000 1.041483 6328.03
19 2 1534102398000 0.992897 6328.74
20 3 1534102399000 0.061871 6328.11
21 2 1534102400000 0.000123 6328.77
22 4 1534102401000 0.028650 6330.25
23 2 1534102402000 0.035504 6330.01
24 3 1534102403000 0.982527 6330.11
25 5 1534102404000 0.298366 6329.11
26 2 1534102405000 0.071119 6330.06
27 3 1534102406000 0.025547 6330.02
28 2 1534102407000 0.003413 6330.11
29 4 1534102408000 0.431217 6330.05
30 3 1534102409000 0.021627 6330.23
31 1 1534102410000 0.009661 6330.28
32 1 1534102411000 0.004209 6330.27
33 1 1534102412000 0.000603 6328.07
34 6 1534102413000 0.655872 6330.31
35 1 1534102414000 0.000452 6328.09
36 7 1534102415000 0.277340 6328.07
37 8 1534102416000 0.768351 6328.04
38 1 1534102417000 0.078893 6328.20
39 2 1534102418000 0.000446 6326.24
40 2 1534102419000 0.317381 6326.83
41 2 1534102420000 0.100009 6326.24
42 2 1534102421000 0.000298 6326.25
43 6 1534102422000 0.566820 6330.00
44 1 1534102423000 0.000060 6326.30
45 2 1534102424000 0.047524 6326.30
46 4 1534102425000 0.748773 6326.61
47 3 1534102426000 0.007656 6330.23
48 1 1534102427000 0.000019 6326.32
49 1 1534102428000 0.000014 6326.34
50 0 1534102429000 0.000000 6326.34
I believe you need data.set_index('TIMESTAMPMS') to get the axis to autoscale.
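A minimal sketch of that suggestion (the division by 1000 to get seconds is my own assumption; the answer only mentions the index change):
import matplotlib.pyplot as plt
import pandas as pd

data = pd.read_csv('secondary-2018-08-12-21-32-56.csv', index_col=0, sep='\t')
# Convert the millisecond timestamps to seconds and use them as the index,
# so the x axis scales from the timestamps instead of from IDX
data['TIMESTAMPMS'] = data['TIMESTAMPMS'] / 1000
data = data.set_index('TIMESTAMPMS')

fig, ax = plt.subplots(nrows=2, sharex=True, figsize=(10, 5))
ax[0].plot(data.index, data['PRICE'])
ax[1].bar(data.index, data['VOLUME'])
plt.show()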
I don't know if I understood you correctly, but try:
data['TIMESTAMPMS'] = data['TIMESTAMPMS']/1000
ax[0].plot(data['TIMESTAMPMS'], data['PRICE'])
ax[1].bar(data['TIMESTAMPMS'], data['VOLUME'])
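If elapsed seconds from the first row are wanted rather than absolute epoch seconds, subtracting the first timestamp is one variation (my own sketch; the SECONDS column name is just illustrative):
# Assumes TIMESTAMPMS still holds the original millisecond values
data['SECONDS'] = (data['TIMESTAMPMS'] - data['TIMESTAMPMS'].iloc[0]) / 1000
ax[0].plot(data['SECONDS'], data['PRICE'])
ax[1].bar(data['SECONDS'], data['VOLUME'])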

How to expand Python Pandas Dataframe in linearly spaced increments

Beginner question:
I have a pandas dataframe that looks like this:
x1 y1 x2 y2
0 0 2 2
10 10 12 12
and I want to expand that dataframe by half units along the x and y coordinates to look like this:
x1 y1 x2 y2 Interpolated_X Interpolated_Y
0 0 2 2 0 0
0 0 2 2 0.5 0.5
0 0 2 2 1 1
0 0 2 2 1.5 1.5
0 0 2 2 2 2
10 10 12 12 10 10
10 10 12 12 10.5 10.5
10 10 12 12 11 11
10 10 12 12 11.5 11.5
10 10 12 12 12 12
Any help would be much appreciated.
The cleanest way I know to expand rows like this is through groupby.apply. It may be faster to use something like itertuples, but the code will be a little more complicated (keep that in mind if your data set is larger).
I groupby the index, which sends each row to my apply function (your index has to be unique for each row; if it's not, just run reset_index). Because I can return a DataFrame from the apply, we can expand one row into multiple rows.
Caveat: your x2-x1 and y2-y1 distances must be the same or this won't work (see the sketch after the example output below for one way around that).
import pandas as pd
import numpy as np

def expand(row):
    row = row.iloc[0]  # apply passes a one-row DataFrame, so grab that single row
    xdistance = (row.x2 - row.x1)
    ydistance = (row.y2 - row.y1)
    xsteps = np.arange(row.x1, row.x2 + .5, .5)  # create the step arrays
    ysteps = np.arange(row.y1, row.y2 + .5, .5)
    # you can repeat a list in python by multiplying: [val] * 3 == [val, val, val]
    return (pd.DataFrame([row] * len(xsteps))
            .assign(int_x=xsteps, int_y=ysteps))

(df.groupby(df.index)                   # "group" on each row
   .apply(expand)                       # send each row to the expand function
   .reset_index(level=1, drop=True))    # drop the extra index level groupby adds
starting df
x1 y1 x2 y2
0 0 2 2
10 10 12 12
ending df
x1 y1 x2 y2 int_x int_y
0 0 0 2 2 0.0 0.0
0 0 0 2 2 0.5 0.5
0 0 0 2 2 1.0 1.0
0 0 0 2 2 1.5 1.5
0 0 0 2 2 2.0 2.0
1 10 10 12 12 10.0 10.0
1 10 10 12 12 10.5 10.5
1 10 10 12 12 11.0 11.0
1 10 10 12 12 11.5 11.5
1 10 10 12 12 12.0 12.0
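If the x and y spans ever differ, one workaround (my own sketch, not part of the answer above; expand_linspace is a hypothetical name) is to pick the number of steps from the larger span and interpolate both coordinates with np.linspace:
def expand_linspace(row, step=0.5):
    row = row.iloc[0]
    # Use the larger span to decide how many points to generate,
    # then interpolate x and y over that same number of steps
    n = int(max(row.x2 - row.x1, row.y2 - row.y1) / step) + 1
    return (pd.DataFrame([row] * n)
            .assign(int_x=np.linspace(row.x1, row.x2, n),
                    int_y=np.linspace(row.y1, row.y2, n)))

expanded = (df.groupby(df.index)
              .apply(expand_linspace)
              .reset_index(level=1, drop=True))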

delete specific rows from csv using pandas

I have a csv file in the format shown below:
I have written the following code, which reads the file and randomly deletes rows whose steering value is 0. I want to keep just 10% of the rows that have a steering value of 0.
df = pd.read_csv(filename, header=None, names = ["center", "left", "right", "steering", "throttle", 'break', 'speed'])
df = df.drop(df.query('steering==0').sample(frac=0.90).index)
However, I get the following error:
df = df.drop(df.query('steering==0').sample(frac=0.90).index)
locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
File "mtrand.pyx", line 1104, in mtrand.RandomState.choice
(numpy/random/mtrand/mtrand.c:17062)
ValueError: a must be greater than 0
Can you guys help me?
Sample DataFrame, built with @andrew_reece's code:
In [9]: df
Out[9]:
center left right steering throttle brake
0 center_54.jpg left_75.jpg right_39.jpg 1 0 0
1 center_20.jpg left_81.jpg right_49.jpg 3 1 1
2 center_34.jpg left_96.jpg right_11.jpg 0 4 2
3 center_98.jpg left_87.jpg right_34.jpg 0 0 0
4 center_67.jpg left_12.jpg right_28.jpg 1 1 0
5 center_11.jpg left_25.jpg right_94.jpg 2 1 0
6 center_66.jpg left_27.jpg right_52.jpg 1 3 3
7 center_18.jpg left_50.jpg right_17.jpg 0 0 4
8 center_60.jpg left_25.jpg right_28.jpg 2 4 1
9 center_98.jpg left_97.jpg right_55.jpg 3 3 0
.. ... ... ... ... ... ...
90 center_31.jpg left_90.jpg right_43.jpg 0 1 0
91 center_29.jpg left_7.jpg right_30.jpg 3 0 0
92 center_37.jpg left_10.jpg right_15.jpg 1 0 0
93 center_18.jpg left_1.jpg right_83.jpg 3 1 1
94 center_96.jpg left_20.jpg right_56.jpg 3 0 0
95 center_37.jpg left_40.jpg right_38.jpg 0 3 1
96 center_73.jpg left_86.jpg right_71.jpg 0 1 0
97 center_85.jpg left_31.jpg right_0.jpg 3 0 4
98 center_34.jpg left_52.jpg right_40.jpg 0 0 2
99 center_91.jpg left_46.jpg right_17.jpg 0 0 0
[100 rows x 6 columns]
In [10]: df.steering.value_counts()
Out[10]:
0 43 # NOTE: 43 zeros
1 18
2 15
4 12
3 12
Name: steering, dtype: int64
In [11]: df.shape
Out[11]: (100, 6)
your solution (unchanged):
In [12]: df = df.drop(df.query('steering==0').sample(frac=0.90).index)
In [13]: df.steering.value_counts()
Out[13]:
1 18
2 15
4 12
3 12
0 4 # NOTE: 4 zeros (~10% from 43)
Name: steering, dtype: int64
In [14]: df.shape
Out[14]: (61, 6)
NOTE: make sure that the steering column has a numeric dtype! If it's a string (object) dtype, then you would need to change your code as follows:
df = df.drop(df.query('steering=="0"').sample(frac=0.90).index)
# NOTE: ^ ^
After that, you can save the modified (reduced) DataFrame to CSV:
df.to_csv('/path/to/filename.csv', index=False)
Here's a one-line approach, using concat() and sample():
import numpy as np
import pandas as pd
# first, some sample data
# generate filename fields
positions = ['center','left','right']
N = 100
fnames = ['{}_{}.jpg'.format(loc, np.random.randint(100)) for loc in np.repeat(positions, N)]
df = pd.DataFrame(np.array(fnames).reshape(3,100).T, columns=positions)
# generate numeric fields
values = [0,1,2,3,4]
probas = [.5,.2,.1,.1,.1]
df['steering'] = np.random.choice(values, p=probas, size=N)
df['throttle'] = np.random.choice(values, p=probas, size=N)
df['brake'] = np.random.choice(values, p=probas, size=N)
print(df.shape)
(100, 6)
The first few rows of sample output:
df.head()
center left right steering throttle brake
0 center_72.jpg left_26.jpg right_59.jpg 3 3 0
1 center_75.jpg left_68.jpg right_26.jpg 0 0 2
2 center_29.jpg left_8.jpg right_88.jpg 0 1 0
3 center_22.jpg left_26.jpg right_23.jpg 1 0 0
4 center_88.jpg left_0.jpg right_56.jpg 4 1 0
5 center_93.jpg left_18.jpg right_15.jpg 0 0 0
Now drop all but 10% of rows with steering==0:
newdf = pd.concat([df.loc[df.steering!=0],
df.loc[df.steering==0].sample(frac=0.1)])
With the probability weightings I used in this example, you'll see somewhere between 50-60 remaining entries in newdf, with about 5 steering==0 cases remaining.
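A quick sanity check along these lines should show the remaining counts (my own addition, not part of the answer):
print(newdf.shape)
print(newdf['steering'].value_counts())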
Using a mask on steering combined with a random number should work:
df = df[(df.steering != 0) | (np.random.rand(len(df)) < 0.1)]
This does generate some extra random values, but it's nice and compact.
Edit: That said, I tried your example code and it worked as well. My guess is that the error comes from your df.query() statement returning an empty DataFrame, which probably means that the steering column does not contain any zeros, or alternatively that the column is read as strings rather than numbers. Try converting the column to a numeric type before running the above snippet, as in the sketch below.
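A conversion along these lines should handle the string case before filtering (a sketch based on an assumption about the data, since the actual CSV isn't shown):
# Coerce the steering column to a numeric dtype, then apply the mask above
df['steering'] = pd.to_numeric(df['steering'], errors='coerce')
df = df[(df.steering != 0) | (np.random.rand(len(df)) < 0.1)]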

Empty square for legend for stackplot

I'm trying to generate a stack plot of version data using matplotlib. I have that portion working and displaying properly, but I'm unable to get the legend to display anything other than an empty square in the corner.
ra_ys = np.asarray(ra_ys)
# Going to generate a stack plot of the version stats
fig = plt.figure()
ra_plot = fig.add_subplot(111)
# Our x axis is going to be the dates, but we need them as numbers
x = [date2num(date) for date in dates]
# Plot the data
ra_plot.stackplot(x, ra_ys)
# Setup our legends
ra_plot.legend(ra_versions) #Also tried converting to a tuple
ra_plot.set_title("blah blah words")
print(ra_versions)
# Only want x ticks on the dates we supplied, and want them to display AS dates
ra_plot.set_xticks(x)
ra_plot.set_xticklabels([date.strftime("%m-%d") for date in dates])
plt.show()
ra_ys is a multidimensional array:
[[ 2 2 2 2 2 2 2 2 2 2 1]
[ 1 1 1 1 1 1 1 1 1 1 1]
[ 1 1 1 1 1 1 1 1 1 1 1]
[53 52 51 50 50 49 48 48 48 48 47]
[18 19 20 20 20 20 21 21 21 21 21]
[ 0 0 12 15 17 18 19 19 19 19 22]
[ 5 5 3 3 3 3 3 3 3 3 3]
[ 4 4 3 3 2 2 2 2 2 2 2]
[14 14 6 4 3 3 2 2 2 2 2]
[ 1 1 1 1 1 1 1 1 1 1 1]
[ 1 1 1 1 1 1 1 1 1 1 1]
[ 1 1 1 1 1 1 1 1 1 1 1]
[ 2 2 2 2 2 2 2 2 2 2 2]
[ 1 1 1 1 1 1 1 1 1 1 1]
[ 1 1 1 1 1 1 1 1 1 1 1]
[ 3 3 2 2 2 2 2 2 2 2 2]]
x is some dates: [734969.0, 734970.0, 734973.0, 734974.0, 734975.0, 734976.0, 734977.0, 734978.0, 734979.0, 734980.0, 734981.0]
ra_versions is a list: ['4.5.2', '4.5.7', '4.5.8', '5.0.0', '5.0.1', '5.0.10', '5.0.7', '5.0.8', '5.0.9', '5.9.105', '5.9.26', '5.9.27', '5.9.29', '5.9.31', '5.9.32', '5.9.34']
Am I doing something wrong? Can stack plots not have legends?
EDIT: I tried to print the handles and labels for the plot and got two empty lists ([] []):
handles, labels = theplot.get_legend_handles_labels()
print(handles,labels)
I then tested the same figure using the following code for a proxy handle, and it worked. So it looks like the lack of handles is the problem.
p = plt.Rectangle((0, 0), 1, 1, fc="r")
theplot.legend([p], ['test'])
So now the question is, how can I generate a variable number of proxy handles that match the colors of my stack plot?
This is the final (cleaner) approach to getting the legend. Since there are no handles, I generate proxy artists for each line. It's theoretically capable of handling cases where colors are reused, but it'll be confusing.
def plot_version_data(title, dates, versions, version_ys, savename=None):
    print("Prepping plot for \"{0}\"".format(title))
    fig = plt.figure()
    theplot = fig.add_subplot(111)
    # Our x axis is going to be the dates, but we need them as numbers
    x = [date2num(date) for date in dates]
    # Use these colors
    colormap = "bgrcmy"
    theplot.stackplot(x, version_ys, colors=colormap)
    # Make some proxy artists for the legend
    p = []
    i = 0
    for _ in versions:
        p.append(plt.Rectangle((0, 0), 1, 1, fc=colormap[i]))
        i = (i + 1) % len(colormap)
    theplot.legend(p, versions)
    theplot.set_ylabel(versions)  # Cheating way to handle the legend
    theplot.set_title(title)
    # Set up the X axis - rotate labels to keep them from overlapping, display like Oct-16,
    # and make sure there's no random whitespace on either end
    plt.xticks(rotation=315)
    theplot.set_xticks(x)
    theplot.set_xticklabels([date.strftime("%b-%d") for date in dates])
    plt.xlim(x[0], x[-1])
    if savename:
        print("Saving output as \"{0}\"".format(savename))
        fig.savefig(os.path.join(sys.path[0], savename))
    else:
        plt.show()
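For reference, a hypothetical call might look like this (the argument names simply mirror the variables used in the question):
plot_version_data("RA versions over time", dates, ra_versions, ra_ys, savename="ra_versions.png")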
