Python: Compute Bin-Mean Value of Scatter Plot Bullets - python-3.x

I have three 1D arrays (A, B, C) of equal length/size. I plot a scatter plot of B vs. A where I color each scatter plot bullet by the corresponding value in the C array (see the code below).
# Imports
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import copy  # used only to copy the built-in colormap below

# Create the Arrays: 400 random points, A and B in [0, 10), C in [0, 100)
n_points = 20 * 20
A = 10 * np.random.random_sample(n_points)
B = 10 * np.random.random_sample(n_points)
C = 100 * np.random.random_sample(n_points)

# Create the Colormap and Define Boundaries
# Work on a copy: calling set_bad() on cm.jet itself would modify the
# colormap object that every other user of matplotlib.cm shares.
cmap_C = copy.copy(cm.jet)
cmap_C.set_bad(color='white')
bounds_C = np.arange(0, 110, 10)  # colour steps of 10 from 0 to 100
norm_C = mpl.colors.BoundaryNorm(bounds_C, cmap_C.N)

# Plot the Figure: B vs. A, each bullet coloured by its C value
plt.figure()
plt.scatter(A, B, c=C, marker='o', s=100, cmap=cmap_C, norm=norm_C)
plt.xlim([-1, 11])
plt.ylim([-1, 11])
plt.xticks(np.arange(0, 11, 1))
plt.yticks(np.arange(0, 11, 1))
plt.xlabel('A')
plt.ylabel('B')
plt.grid()  # the grid lines mark the 1 x 1 bins discussed below
plt.colorbar(label='Value of C')
plt.show()
Some bullets overlap in the figure, so we cannot see them clearly. Therefore, I next want to compute and plot the mean C value of all scatter plot bullets within each 1 x 1 integer bin in the figure, so that each grid square is colored by one single color (these bins are illustrated by the figure gridding). How can I do this?

It's not totally clear what you are trying to do, but I think there is an analytic result to your question before you work too hard. The expected mean value of color (C vector) is 50 because you have generated a uniformly distributed sample [0, 100]. The coordinates are also uniformly distributed, but that is irrelevant. Of course, there will be some variance in each of the grid squares.
If you need to go forward as an exercise, I'd construct a dictionary of coordinate:color mappings to help set up a screen...
# Map each point's (x, y) coordinate pair to its colour value from C.
# NOTE: two points with exactly the same coordinates share one key, so only
# the last colour would be kept for them.
color_map = {(x, y): color for x, y, color in zip(A,B,C)}
Then you could set up a second dictionary to gather the results for each grid square. Taking the int() value of each coordinate tells you which grid square the point belongs to, so you can add its color value to the correct entry.

Below is a solution that works for my purposes.
# Imports
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from zipfile import ZipFile
import copy  # used only to copy the built-in colormaps below


def _format_axes(xlabel=None):
    """Apply the axis limits, ticks, labels and grid shared by all panels."""
    plt.xlim([-1, 11])
    plt.ylim([-1, 11])
    plt.xticks(np.arange(0, 11, 1))
    plt.yticks(np.arange(0, 11, 1))
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel('B')
    plt.grid()


# Create the Arrays: xx*yy random points, A and B in [0, 10), C in [0, 100)
xx = 5
yy = 5
n_points = xx * yy
A = 10 * np.random.random_sample(n_points)
B = 10 * np.random.random_sample(n_points)
C = 100 * np.random.random_sample(n_points)

# Bin the Points on the 1 x 1 Integer Grid
xedges = np.arange(11)
yedges = np.arange(11)
# H[i, j] is the number of points whose A falls in x-bin i and B in y-bin j.
H, xedges, yedges = np.histogram2d(A, B, bins=(xedges, yedges))
# With C passed as weights, the same call returns the sum of C in each bin.
# This keeps the full floating-point C values and counts every point, even
# if two points happen to share the same coordinates.
C_sum, _, _ = np.histogram2d(A, B, bins=(xedges, yedges), weights=C)
# Transpose so that rows follow B (y) and columns follow A (x) for pcolor.
HT = H.T
# Bin mean = sum / count. Empty bins evaluate 0/0 = NaN on purpose; they are
# masked before plotting, so the warning is silenced here.
with np.errstate(divide='ignore', invalid='ignore'):
    BMC = C_sum.T / HT
print(HT)
print(BMC)
BMC_plot = np.ma.array(BMC, mask=np.isnan(BMC))  # Mask NaN

# Create the Colormaps and Define Boundaries
# Copies are used so that set_bad() does not modify the shared colormaps.
cmap_C = copy.copy(cm.jet)
cmap_C.set_bad(color='white')
bounds_C = np.arange(-5, 115, 10)  # boundaries centred on 0, 10, ..., 100
norm_C = mpl.colors.BoundaryNorm(bounds_C, cmap_C.N)
cmap_hist2d = copy.copy(cm.CMRmap_r)
cmap_hist2d.set_bad(color='white')
bounds_hist2d = np.arange(-0.5, 4.5, 1)  # boundaries centred on counts 0..3
norm_hist2d = mpl.colors.BoundaryNorm(bounds_hist2d, cmap_hist2d.N)
ticks_C = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Panel 1: the raw scatter plot, coloured by C
plt.subplot(311)
plt.scatter(A, B, c=C, marker='o', s=100, cmap=cmap_C, norm=norm_C)
_format_axes()
plt.colorbar(label='Value of C', ticks=ticks_C)

# Panel 2: number of points in each bin
plt.subplot(312)
x, y = np.meshgrid(xedges, yedges)
plt.pcolor(x, y, HT, cmap=cmap_hist2d, norm=norm_hist2d)
_format_axes()
plt.colorbar(label='Number of Data in Bin',
             ticks=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Panel 3: mean C value in each bin (empty bins are masked)
plt.subplot(313)
plt.pcolor(x, y, BMC_plot, cmap=cmap_C, norm=norm_C)
_format_axes(xlabel='A')
plt.colorbar(label='Bin-Mean C Value', ticks=ticks_C)
plt.show()

Related

Is there some way can add label in legend in plot by one step?

My legend now shows,
I want the legend labels to run from 0 to 7, but I don't want to add a for-loop to my code and correct each label one by one. My code looks like this:
# Scatter the two PCA components, coloured by predicted cluster, and build the
# legend directly from the scatter's colour values.
fig, ax = plt.subplots()
ax.set_title('Clusters by OPTICS in 2D space after PCA')
ax.set_xlabel('First Component')
ax.set_ylabel('Second Component')
first_component = pca_2_spec[:,0]
second_component = pca_2_spec[:,1]
points = ax.scatter(first_component, second_component, s=7, marker='o',
                    c=pred_pca_2_spec, cmap='rainbow')
handles, labels = points.legend_elements()
ax.legend(handles, labels, title='cluster')
plt.show()
Assuming pred_pca_2_spec is a NumPy array whose values are multiples of 5, such as [0, 5, 10, 15, 20, 25, 30, 35], you can map the values onto the range 0-7 by simply dividing each element by 5.
Sample Data:
import numpy as np
from matplotlib import pyplot as plt
# Reproducible sample data: 100 two-component points and, for each point, a
# cluster value drawn from the multiples of 5 between 0 and 35.
np.random.seed(54)
n_samples = 100
cluster_values = np.arange(0, 40, 5)
pca_2_spec = np.random.randint(-100, 300, size=(n_samples, 2))
pred_pca_2_spec = np.random.choice(cluster_values, size=n_samples)
Plotting Code:
# Same plot as in the question, but the colour values are divided by 5 so the
# automatically generated legend entries run from 0 to 7.
fig, ax = plt.subplots()
ax.set_title('Clusters by OPTICS in 2D space after PCA')
ax.set_xlabel('First Component')
ax.set_ylabel('Second Component')
cluster_ids = pred_pca_2_spec / 5  # Divide By 5
points = ax.scatter(pca_2_spec[:, 0], pca_2_spec[:, 1], s=7, marker='o',
                    c=cluster_ids, cmap='rainbow')
handles, labels = points.legend_elements()
ax.legend(handles, labels, title='cluster')
plt.show()

How to stop chart from printing in this cell but the next cell with really short code

These codes produce a chart
import numpy as np
import matplotlib.pyplot as plt
# Data for five groups: mean score and standard deviation per gender.
N = 5
menMeans, menStd = (20, 35, 30, 35, 27), (2, 3, 4, 1, 2)
womenMeans, womenStd = (25, 32, 34, 20, 25), (3, 5, 2, 3, 3)
ind = np.arange(N)  # the x locations for the groups
width = 0.35  # the width of the bars: can also be len(x) sequence

# Stacked bars: the women's bars start where the men's bars end.
p1 = plt.bar(ind, menMeans, width, yerr=menStd)
p2 = plt.bar(ind, womenMeans, width, bottom=menMeans, yerr=womenStd)

plt.ylabel('Scores')
plt.title('Scores by group and gender')
group_names = ('G1', 'G2', 'G3', 'G4', 'G5')
plt.xticks(ind, group_names)
plt.yticks(np.arange(0, 81, 10))
legend_handles = (p1[0], p2[0])
plt.legend(legend_handles, ('Men', 'Women'))
Jupyter Notebook automatically displays the chart, even though I didn't call plt.show(). I don't want the chart to appear in the same cell as this code; instead, I want it to appear in the next cell when I run a really short statement such as plt.show(), so that the cell stays as concise as possible.
Just enclose all your plot-related statements inside a function called plot_and_show(). Then you can call the function when you are ready.
import matplotlib.pyplot as plt
import numpy as np
# Data for five groups: mean score and standard deviation per gender.
N = 5
menMeans = (20, 35, 30, 35, 27)
womenMeans = (25, 32, 34, 20, 25)
menStd = (2, 3, 4, 1, 2)
womenStd = (3, 5, 2, 3, 3)
ind = np.arange(N)  # the x locations for the groups
width = 0.35  # the width of the bars: can also be len(x) sequence


def plot_and_show():
    """Draw the stacked bar chart from the module-level data and display it.

    Nothing is drawn when the function is defined, so the chart only
    appears in the cell where the function is called.
    """
    p1 = plt.bar(ind, menMeans, width, yerr=menStd)
    # The women's bars are stacked on top of the men's bars.
    p2 = plt.bar(ind, womenMeans, width,
                 bottom=menMeans, yerr=womenStd)
    plt.ylabel('Scores')
    plt.title('Scores by group and gender')
    plt.xticks(ind, ('G1', 'G2', 'G3', 'G4', 'G5'))
    plt.yticks(np.arange(0, 81, 10))
    plt.legend((p1[0], p2[0]), ('Men', 'Women'))
    # Explicit show() so the function also works outside a notebook.
    plt.show()


# Put this call in whichever cell should display the chart.
plot_and_show()

How to autoscale y-axis for bargraph in matplotlib?

I need to autoscale the y-axis on my bargraph in matplotlib in order to display the small differences in values. The reason why it needs to be autoscaled instead of having a fixed limit is because the values will change depending on what the user inputs. I've tried yscale log, but that doesn't work for negative values. I've tried symlog, but the graph stays the same. This is my current code:
# Ten bars whose heights (700..709) differ only slightly from one another.
x = list(range(1, 11))
y = range(700, 710)
fig, ax = plt.subplots()
ax.bar(x, y)
plt.show()
Plots are automatically scaled for the full range of the data provided to the API.
For a bar plot, the best option to display the differences in the values of the bars, is probably to set the ylim for vertical bars or xlim for horizontal bars.
negative data
import matplotlib.pyplot as plt
# Negative data: ten bars from -700 down to -745.
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = range(-700, -750, -5)
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(x, y)
# Zoom the y-axis onto the data range. A small padding is added because with
# limits of exactly min(y)..max(y) the bar closest to zero would end on the
# edge of the axes and have no visible height.
pad = 0.05 * (max(y) - min(y)) or 1  # fall back to 1 if all values are equal
ax.set_ylim(min(y) - pad, max(y) + pad)
positive data
# Positive data: ten bars from 700 up to 745.
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = range(700, 750, 5)
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(x, y)
# Zoom the y-axis onto the data range. A small padding is added because with
# limits of exactly min(y)..max(y) the shortest bar would end on the bottom
# edge of the axes and have no visible height.
pad = 0.05 * (max(y) - min(y)) or 1  # fall back to 1 if all values are equal
ax.set_ylim(min(y) - pad, max(y) + pad)
mixed data
If the data has a wide range of positive and negative values, there's probably not a good option, as you've noted symlog doesn't help the issue.
The best option may be to plot the positive and negative data separately.
Creating a mask doesn't work with a list, so convert the lists to numpy arrays.
import numpy as np
def _zoomed_ylim(values, frac=0.05):
    """Return (low, high) y-limits around *values* with a small padding.

    The padding keeps the bar closest to zero from ending exactly on the
    edge of the axes, where it would have no visible height.
    """
    low, high = min(values), max(values)
    pad = frac * (high - low) or 1  # fall back to 1 if all values are equal
    return low - pad, high + pad


x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = [700, -700, 710, -710, 720, -720, 730, -730, 740, -740]
# Boolean masks only work on arrays, not on plain lists.
x = np.array(x)
y = np.array(y)
mask = y >= 0  # positive mask
pos_y = y[mask]  # get the positive values
neg_y = y[~mask]  # get the negative values; ~ is not

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(7, 5))

ax1.bar(x[mask], pos_y)  # also mask x to plot the bar at the correct x-tick
ax1.set_title('Positive Values')
ax1.set_ylim(*_zoomed_ylim(pos_y))
ax1.set_xticks(range(0, 12))  # buffer the number of x-ticks, so the x-ticks of the two plots align.

ax2.bar(x[~mask], neg_y)
ax2.set_title('Negative Values')
ax2.set_ylim(*_zoomed_ylim(neg_y))
ax2.set_xticks(range(0, 12))

plt.tight_layout()  # better spacing between the two plots

How to set line colors using hex from color palette holoviews

I want to specify manually the color of a line segment in holoviews, based on a third column.
I'm aware of the hv.Path examples, however, this reduces the length of the line with 1 segment, which I don't want.
I can do it using bokeh, or using matplotlib, but I can't get it right using holoviews
def get_color(min_val, max_val, val, palette):
    """Return the palette entry that corresponds to *val*.

    The range [min_val, max_val] is mapped linearly onto the indices of
    *palette*: min_val selects the first entry and max_val the last one.

    Args:
        min_val: Value mapped to the first palette entry.
        max_val: Value mapped to the last palette entry; must differ from
            min_val.
        val: Value to look up.
        palette: Indexable sequence of colors.

    Returns:
        The selected element of *palette*. Values outside the range get
        the first or the last entry.

    Raises:
        ZeroDivisionError: If min_val equals max_val.
    """
    last = len(palette) - 1
    # Adding .5 before int() rounds to the nearest index.
    index = int((val - min_val) * (last / (max_val - min_val)) + .5)
    # Clamp so that a value below min_val cannot wrap around to the end of
    # the palette and a value above max_val cannot raise IndexError.
    return palette[max(0, min(index, last))]
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.palettes import Viridis256  # palette passed to get_color below

# Six points on a vertical line and one z value per segment between them.
y = [0, 1, 2, 3, 4, 5]
x = [0] * len(y)
z = [1, 2, 3, 4, 5]
p = figure(plot_width=500, plot_height=200, tools='')
# Draw each segment i -> i+1 as its own line so it can have its own colour.
for i, z_val in enumerate(z):
    p.line([x[i], x[i + 1]], [y[i], y[i + 1]],
           line_color=get_color(1, 5, z_val, Viridis256), line_width=4)
show(p)
import numpy
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
# The line format you currently have: one (x values, y values) pair per line
lines = [[(0, 1, 2, 3, 4), (4, 5, 6, 7, 8)],
         [(0, 1, 2, 3, 4), (0, 1, 2, 3, 4)],
         [(0, 1, 2, 3, 4), (8, 7, 6, 5, 4)],
         [(4, 5, 6, 7, 8), (0, 1, 2, 3, 4)]]
# Reformat it to what `LineCollection` expects: a list of (x, y) points per
# line. On Python 3 zip() is a lazy iterator, so it is materialised with
# list() to give a real sequence of points.
lines = [list(zip(xs, ys)) for xs, ys in lines]
# One value per line, used for colouring. The module is imported above as
# plain `numpy` (no `np` alias), so that name is used here.
z = numpy.array([0.1, 9.4, 3.8, 2.0])
fig, ax = plt.subplots()
lines = LineCollection(lines, array=z, cmap=plt.cm.rainbow, linewidths=5)
ax.add_collection(lines)
fig.colorbar(lines)
# Manually adding artists doesn't rescale the plot, so we need to autoscale
ax.autoscale()
plt.show()
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.palettes import Viridis256  # palette passed to get_color below

# Six points on a vertical line and one z value per segment between them.
y = [0, 1, 2, 3, 4, 5]
x = [0] * len(y)
z = [1, 2, 3, 4, 5]
p = figure(plot_width=500, plot_height=200, tools='')
# Draw each segment i -> i+1 as its own line so it can have its own colour.
for i, z_val in enumerate(z):
    p.line([x[i], x[i + 1]], [y[i], y[i + 1]],
           line_color=get_color(1, 5, z_val, Viridis256), line_width=4)
show(p)
from bokeh.palettes import Viridis256
# Holoviews attempt: build one two-point Curve per segment and overlay them.
# line_color is passed here as a constructor keyword; the warning quoted
# right after this snippet complains about exactly that attribute.
curvlst = [hv.Curve([[x[i],y[i]],[x[i+1],y[i+1]]],line_color = get_color(1,5,z,Viridis256)) for i,z in enumerate(z) ]
hv.Overlay(curvlst)
WARNING:param.Curve26666: Setting non-parameter attribute line_color=#440154 using a mechanism intended only for parameters
You could use a so called dim transform by rewriting the function a little bit:
def get_color(val, min_val, max_val, palette):
    """Return a one-element list with the palette entry for *val*.

    The value comes first so that the function can be used as a
    transform, with the remaining arguments supplied after it. The range
    [min_val, max_val] is mapped linearly onto the indices of *palette*.

    Args:
        val: Value to look up; it must be convertible with int() after
            scaling.
        min_val: Value mapped to the first palette entry.
        max_val: Value mapped to the last palette entry; must differ from
            min_val.
        palette: Indexable sequence of colors.

    Returns:
        A list holding the single selected element of *palette*. Values
        outside the range get the first or the last entry.

    Raises:
        ZeroDivisionError: If min_val equals max_val.
    """
    last = len(palette) - 1
    # Adding .5 before int() rounds to the nearest index.
    index = int((val - min_val) * (last / (max_val - min_val)) + .5)
    # Clamp so that an out-of-range value cannot wrap around to the end of
    # the palette or raise IndexError.
    return [palette[max(0, min(index, last))]]
# Six points on a vertical line and one z value per segment between them.
y = list(range(6))
x = [0] * len(y)
z = [1, 2, 3, 4, 5]
# One two-point Curve per segment, keyed by the segment's z value.
segments = {
    z_val: hv.Curve(([x[i], x[i + 1]], [y[i], y[i + 1]]))
    for i, z_val in enumerate(z)
}
# The colour of each Curve is derived from the 'z' key dimension by passing
# it through get_color together with the range 1..5 and the palette.
hv.NdOverlay(segments, kdims=['z']).opts(
    'Curve', color=hv.dim('z', get_color, 1, 5, Viridis256))
That being said, I don't think you should have to manually colormap Curves so I've opened: https://github.com/pyviz/holoviews/issues/3764.
I think I found out..
from bokeh.palettes import Viridis256
def get_color(min_val, max_val, val, palette):
    """Return the palette entry that corresponds to *val*.

    The range [min_val, max_val] is mapped linearly onto the indices of
    *palette*: min_val selects the first entry and max_val the last one.

    Args:
        min_val: Value mapped to the first palette entry.
        max_val: Value mapped to the last palette entry; must differ from
            min_val.
        val: Value to look up.
        palette: Indexable sequence of colors.

    Returns:
        The selected element of *palette*. Values outside the range get
        the first or the last entry.

    Raises:
        ZeroDivisionError: If min_val equals max_val.
    """
    last = len(palette) - 1
    # Adding .5 before int() rounds to the nearest index.
    index = int((val - min_val) * (last / (max_val - min_val)) + .5)
    # Clamp so that a value below min_val cannot wrap around to the end of
    # the palette and a value above max_val cannot raise IndexError.
    return palette[max(0, min(index, last))]
# Build one two-point Curve per segment, set its colour through .opts(), and
# overlay all of the segments.
curvlst = []
for i, z_val in enumerate(z):
    start, end = [x[i], y[i]], [x[i + 1], y[i + 1]]
    segment = hv.Curve([start, end])
    curvlst.append(segment.opts(color=get_color(1, 5, z_val, Viridis256)))
hv.Overlay(curvlst)
Please let me know if this is good practice, or if you know a better way.

How to center the grid of a plot on scatter points?

I have this scatter plot:
I'd like to move the grid in a way that each point (green square) would be surrounded by the grid's cells. For example:
The code to reproduce the plot:
import matplotlib.pyplot as plt
# Eight green square markers, with ticks (and grid lines) on every integer.
data = [24, 24, 24, 16, 16, 2, 2, 2]
x = list(range(len(data)))
y = list(range(25))
plt.scatter(x, data, marker='s', c='g', s=100)
plt.yticks(y)
plt.xticks(x)
plt.grid(True)  # grid lines follow the ticks, so they pass through the points
plt.show()
Maybe something like the following meets the requirement. You can use the minor ticks for the grid and the major ticks for the labels.
import numpy as np
import matplotlib.pyplot as plt
data = [24, 24, 24, 16, 16, 2, 2, 2]
n_cols = len(data)
n_rows = 25
x = list(range(n_cols))

fig, ax = plt.subplots()
ax.scatter(x, data, marker='s', c='g', s=49)
# Major ticks sit on the integers and carry the labels; minor ticks sit
# halfway between them and carry the grid, so every point lies in the centre
# of a grid cell.
for set_ticks, count in ((ax.set_yticks, n_rows), (ax.set_xticks, n_cols)):
    set_ticks(np.arange(count))
    set_ticks(np.arange(count + 1) - 0.5, minor=True)
ax.grid(True, which="minor")
ax.set_aspect("equal")
plt.show()

Resources