How to color an imshow chart based on a custom range - layout

I'm working on a chart that works as a heatmap, but instead of a "traditional" color scale, I want to set a specific color to values between x and x+0.05;
The image below is a good visual example to illustrate what I want to do:
I have tried several approaches, but none of them has worked.
Below is an MWE
import pandas as pd
import plotly.express as px
# Symmetric user-by-user score table; the diagonal entries are None.
dict_vals = [
    {"user1": None, "user2": 0.906, "user3": 0.842},
    {"user1": 0.906, "user2": None, "user3": 0.527},
    {"user1": 0.842, "user2": 0.527, "user3": None},
]
matrix_df = pd.DataFrame.from_dict(dict_vals)

# Palette used for the colorscale stops below.
colors = [
    "rgb(214, 11, 67)",
    "rgb(255, 107, 0)",
    "rgb(255, 201, 77)",
    "rgb(168, 227, 0)",
    "rgb(65, 175, 26)",
]

# Heatmap with the colour range limited to 0.75-0.95 and cell values printed.
imshow_options = dict(
    x=matrix_df.columns,
    y=matrix_df.columns,
    zmin=0.75,
    zmax=0.95,
    text_auto=True,
    aspect="auto",
)
fig = px.imshow(matrix_df, **imshow_options)

# Pair every colorscale position with its colour; the last colour is used
# for both the 0.95 and the 1 stop.
scale_positions = (0.0, 0.8, 0.85, 0.9, 0.95, 1)
fig.update_coloraxes(
    showscale=False,
    colorscale=list(zip(scale_positions, colors + colors[-1:])),
)
If you have any reference about how to solve this, I will be very grateful

I found a way to do what I was looking for
import pandas as pd
import plotly.express as px
import numpy as np
# Symmetric user-by-user score table; the diagonal entries are None.
dict_vals = [
    {"user1": None, "user2": 0.906, "user3": 0.842},
    {"user1": 0.906, "user2": None, "user3": 0.527},
    {"user1": 0.842, "user2": 0.527, "user3": None},
]
matrix_df = pd.DataFrame.from_dict(dict_vals)

# Palette for the five bands. Defined here so the snippet runs on its own
# instead of relying on a `colors` list created elsewhere.
colors = [
    "rgb(214, 11, 67)",
    "rgb(255, 107, 0)",
    "rgb(255, 201, 77)",
    "rgb(168, 227, 0)",
    "rgb(65, 175, 26)",
]

# Every interior boundary appears twice, once with the colour of the band
# ending there and once with the colour of the band starting there, so the
# scale jumps between colours instead of blending them.
color_list = [
    [0, colors[0]],
    [0.80, colors[0]],
    [0.80, colors[1]],
    [0.85, colors[1]],
    [0.85, colors[2]],
    [0.90, colors[2]],
    [0.90, colors[3]],
    [0.95, colors[3]],
    [0.95, colors[4]],
    [1.00, colors[4]],
]

# zmin=0 and zmax=1 make the colorscale positions coincide with the data
# values, so the boundaries above can be read directly as thresholds.
fig = px.imshow(
    matrix_df.values,
    x=matrix_df.columns,
    y=matrix_df.columns,
    zmin=0,
    zmax=1,
    text_auto=True,
    aspect="auto",
    color_continuous_scale=color_list,
)
fig.update_coloraxes(showscale=True)
The result will be:
I hope it can help someone in the future.
Regards,
Leonardo

Related

Custom annotation of text in seaborn heatmap

I want to assign different fontsizes for positive and negative values in the following heatmap plotted using seaborn.
import seaborn as sns # % matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd  # the snippet uses `pd` without importing it

# One entry per row label, each holding the two values of that row.
# (An earlier array assigned to `data` was overwritten immediately and has
# been dropped.)
data = {0: [0.000000, 0.000000], 1: [2.31049, 0.000000], 2: [-0.231049, 0.000000]}
df = pd.DataFrame.from_dict(data, orient='index')

# First pass: colour range 0..10, annotations at font size 16.
sns.heatmap(
    df, cmap='bwr', vmax=10, vmin=0, annot=True, fmt='f',
    linewidths=0.25, annot_kws={"fontsize": 16}, center=0, square=True
)
# Second pass over the same data: colour range -10..0, font size 6.
sns.heatmap(
    df, cmap='bwr', vmax=0, vmin=-10, annot=True, fmt='f',
    linewidths=0.25, annot_kws={"fontsize": 6}, center=0, square=True
)
plt.show()
I tried to specify the min and max and plot in two steps, but the colors and fonts aren't displayed correctly.
Suggestions on how to fix this will be of great help.
To make it easier to keep the properties in sync, the code below uses a for loop. For the positive part, the dataframe is filtered to only contain the positive values. (Internally, pandas fills in NaN for the values that get filtered away, and seaborn leaves those cells blank.)
vmin and vmax are set to the same values for both the negative and positive part of the loop. That way, the colorbar will show all values. To avoid drawing the colorbar twice, cbar=False is passed on one of the two iterations.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# 12x12 grid of random integers from -10 to 10 (upper bound 11 is exclusive).
df = pd.DataFrame(np.random.randint(-10, 11, (12, 12)))
fig, ax = plt.subplots()

# Draw the heatmap twice on the same axes: once with only the positive
# cells and once with only the negative cells, each with its own font size.
for posneg in ['pos', 'neg']:
    sns.heatmap(
        # Boolean-mask indexing keeps the frame's shape; non-matching cells become NaN.
        df[df > 0] if posneg == 'pos' else df[df < 0],
        cmap='bwr', vmin=-10, vmax=10, center=0, annot=True, fmt='.0f',
        annot_kws={"fontsize": 16 if posneg == 'pos' else 8},
        # The colorbar is requested only on the 'neg' pass so it is drawn once.
        cbar=(posneg != 'pos'), cbar_kws={'ticks': range(-10, 11, 2)},
        linewidths=0.25, square=True, ax=ax
    )
plt.show()
PS: The code above uses if/else inside some of the arguments. Such a conditional expression can be handy when only something short is involved, or in a list comprehension.
An alternative would be to use a normal if test together with variables, e.g.:
# Same two-pass drawing, with the per-pass settings chosen by a regular
# if/else instead of inline conditional expressions.
for posneg in ['pos', 'neg']:
    if posneg == 'pos':
        df_filtered = df[df > 0]
        fontsize = 16
        fontweight = 'bold'
    else:
        df_filtered = df[df < 0]
        fontsize = 12
        fontweight = 'light'
    sns.heatmap(
        df_filtered,
        cmap='bwr', vmin=-10, vmax=10, center=0, annot=True, fmt='.0f',
        annot_kws={"fontweight": fontweight, "fontsize": fontsize},
        # The colorbar is requested only on the 'neg' pass so it is drawn once.
        cbar=(posneg != 'pos'), cbar_kws={'ticks': range(-10, 11, 2)},
        linewidths=0.25, square=True, ax=ax
    )

Insert a png image in a matplotlib figure

I'm trying to insert a png image in matplotlib figure (ref)
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.figure import Figure
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
ax = plt.subplot(111)

# Green line with circle markers through (1, 1), (2, 2), (3, 3).
points = [1, 2, 3]
ax.plot(points, points, 'go-', label='line 1', linewidth=2)

# Load the PNG and anchor it at axes-fraction position (1, 0).
arr_img = plt.imread("stinkbug.png")
im = OffsetImage(arr_img)
ab = AnnotationBbox(im, (1, 0), xycoords='axes fraction')
ax.add_artist(ab)

plt.show()
Inset image:
Output obtained:
I'd like to know how to resize the image that has to be inserted to avoid overlaps.
EDIT:
Saving the figure
ax.figure.savefig("output.svg", transparent=True, dpi=600, bbox_inches="tight")
You can zoom the image and then set the box alignment to the lower right corner (1, 0), plus some extra for the margins:
im = OffsetImage(arr_img, zoom=.45)
ab = AnnotationBbox(im, (1, 0), xycoords='axes fraction', box_alignment=(1.1,-0.1))
You may also want to use data coordinates, which is the default, and use the default box_alignment to the center, e.g. ab = AnnotationBbox(im, (2.6, 1.45)). See the xycoords parameter doc for more information about various coordinate options.

Kmeans - Find the closest distance between two points of two cluster

I'm using the data from a csv file and dividing it into 2 clusters. The blue one is the benign URL and the red one is the malicious URL. The problem is that my instructor wants to know the closest points of the two clusters and the distance between them. I tried to use the distance between the two centroids, but it didn't work!
Thank you for your help!
Here is the code and the clusters:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv("major_combined.csv")

# Cluster on a fixed selection of nine columns, picked by position.
X_mal = data.iloc[:, [1, 5, 6, 11, 13, 15, 19, 20, 23]]
kmeans_mal = KMeans(
    n_clusters=2,
    init='random',
    max_iter=300,
    n_init=10,
    random_state=0,
)
y_kmeans_mal = kmeans_mal.fit_predict(X_mal)

plt.figure(figsize=(10, 10))

# One scatter per cluster label, plotting selected columns 4 and 8.
for cluster_id, cluster_color in ((0, 'blue'), (1, 'red')):
    members = X_mal.iloc[y_kmeans_mal == cluster_id]
    plt.scatter(
        members.iloc[:, 4],
        members.iloc[:, 8],
        s=100,
        color=cluster_color,
        label=str(cluster_id),
    )

# Cluster centres, drawn larger and in yellow.
centroids = kmeans_mal.cluster_centers_
plt.scatter(centroids[:, 4], centroids[:, 8], s=300, color='yellow', label='Centroid')

plt.title('k-means clustering')
plt.xlabel('document')
plt.ylabel('window')
plt.legend()
plt.grid()
plt.show()

Training a Random Forest on Tensorflow

I am trying to train a tensorflow based random forest regression on numerical and continuous data.
When I try to fit my estimator it begins with the message below:
INFO:tensorflow:Constructing forest with params =
INFO:tensorflow:{'num_trees': 10, 'max_nodes': 1000, 'bagging_fraction': 1.0, 'feature_bagging_fraction': 1.0, 'num_splits_to_consider': 10, 'max_fertile_nodes': 0, 'split_after_samples': 250, 'valid_leaf_threshold': 1, 'dominate_method': 'bootstrap', 'dominate_fraction': 0.99, 'model_name': 'all_dense', 'split_finish_name': 'basic', 'split_pruning_name': 'none', 'collate_examples': False, 'checkpoint_stats': False, 'use_running_stats_method': False, 'initialize_average_splits': False, 'inference_tree_paths': False, 'param_file': None, 'split_name': 'less_or_equal', 'early_finish_check_every_samples': 0, 'prune_every_samples': 0, 'feature_columns': [_NumericColumn(key='Average_Score', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='lat', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='lng', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)], 'num_classes': 1, 'num_features': 2, 'regression': True, 'bagged_num_features': 2, 'bagged_features': None, 'num_outputs': 1, 'num_output_columns': 2, 'base_random_seed': 0, 'leaf_model_type': 2, 'stats_model_type': 2, 'finish_type': 0, 'pruning_type': 0, 'split_type': 0}
Then the process breaks down and I get a value error below:
ValueError: Shape must be at least rank 2 but is rank 1 for 'concat' (op: 'ConcatV2') with input shapes: [?], [?], [?], [] and with computed input tensors: input[3] = <1>.
This is the code I am using:
import tensorflow as tf
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources
import pandas as pd
from tensorflow.contrib.tensor_forest.client import random_forest
from tensorflow.python.estimator.inputs import numpy_io
import numpy as np
def getFeatures():
    """Return the numeric feature columns for 'Average_Score', 'lat' and 'lng', in that order."""
    return [
        tf.feature_column.numeric_column(column_name)
        for column_name in ('Average_Score', 'lat', 'lng')
    ]
# Import hotel data
Hotel_Reviews = pd.read_csv("./DataMining/Hotel_Reviews.csv")
# NOTE(review): `|` keeps rows where at least one of lat/lng is present;
# if rows missing either coordinate should be dropped, `&` is needed - confirm.
Hotel_Reviews_Filtered = Hotel_Reviews[
    Hotel_Reviews.lat.notnull() | Hotel_Reviews.lng.notnull()
]
Hotel_Reviews_Filtered_Target = Hotel_Reviews_Filtered[["Reviewer_Score"]]
Hotel_Reviews_Filtered_Features = Hotel_Reviews_Filtered[["Average_Score", "lat", "lng"]]

# Preprocess the data: one numpy array per feature name.
x = {
    key: np.array(values)
    for key, values in Hotel_Reviews_Filtered_Features.to_dict('list').items()
}
y = Hotel_Reviews_Filtered_Target.values

# Specify params. The keyword is spelled `feature_columns` so it matches the
# key shown in the logged forest parameters.
params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
    feature_columns=getFeatures(),
    num_classes=1,
    num_features=2,
    regression=True,
    num_trees=10,
    max_nodes=1000)

# Build the graph
graph_builder_class = tensor_forest.RandomForestGraphs
est = random_forest.TensorForestEstimator(
    params, graph_builder_class=graph_builder_class)

# Define input function
train_input_fn = numpy_io.numpy_input_fn(
    x=x,
    y=y,
    batch_size=1000,
    num_epochs=1,
    shuffle=True)

est.fit(input_fn=train_input_fn, steps=500)
The variable x is a dict of numpy arrays, each of shape (512470,):
{'Average_Score': array([ 7.7, 7.7, 7.7, ..., 8.1, 8.1, 8.1]),
'lat': array([ 52.3605759, 52.3605759, 52.3605759, ..., 48.2037451,
48.2037451, 48.2037451]),
'lng': array([ 4.9159683, 4.9159683, 4.9159683, ..., 16.3356767,
16.3356767, 16.3356767])}
The variable y is a numpy array of shape (512470,1):
array([[ 2.9],
[ 7.5],
[ 7.1],
...,
[ 2.5],
[ 8.8],
[ 8.3]])
Force each array in x to be 2 dim using ndmin=2. Then the shapes should match and concat should be able to operate.

How to plot vertical lines in plotly offline?

How would one plot a vertical line in plotly offline, using python? I want to add lines at x=20, x=40, and x=60, all in the same plot.
def graph_contracts(self):
    """Plot each entry of ``all_prices`` as a marker against its index and open the figure with plotly offline."""
    trace1 = go.Scatter(
        x=np.array(range(len(all_prices))),
        y=np.array(all_prices),
        mode='markers',
        marker=dict(size=10, color='rgba(152, 0, 0, .8)'),
    )
    data = [trace1]
    layout = go.Layout(
        title='Market Contracts by Period',
        xaxis=dict(
            title='Contract #',
            titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f'),
        ),
        yaxis=dict(
            title='Prices ($)',
            titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f'),
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    py.offline.plot(fig)
You can add lines via shape in layout, e.g.
import plotly
plotly.offline.init_notebook_mode()
import random
# 100 markers at x = 0..99 with random y values in [0, 1).
x = list(range(100))
trace = plotly.graph_objs.Scatter(
    x=x,
    y=[random.random() for _ in x],
    mode='markers',
)

# One vertical line per requested x position: x0 == x1 makes it vertical,
# and y0/y1 span 0..1 in data coordinates (xref/yref point at the axes).
shapes = [
    {
        'type': 'line',
        'xref': 'x',
        'yref': 'y',
        'x0': i,
        'y0': 0,
        'x1': i,
        'y1': 1,
    }
    for i in (20, 40, 60)
]

layout = plotly.graph_objs.Layout(shapes=shapes)
fig = plotly.graph_objs.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig)
would give you
This is my example. The most important instruction is this.
fig.add_trace(go.Scatter(x=[12, 12], y=[-300,300], mode="lines", name="SIGNAL"))
The most important attribute is mode="lines".
Actually this example is about a segment with x=12
EXAMPLE
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import numpy as np
import plotly.tools as tls
curve_df = pd.read_csv('./jnjw_f8.csv')

# Linear ticks every 3 units, starting at 1 on x and at -100 on y.
x_axis = go.layout.XAxis(tickmode='linear', tick0=1, dtick=3)
y_axis = go.layout.YAxis(tickmode='linear', tick0=-100, dtick=3)
layout = go.Layout(xaxis=x_axis, yaxis=y_axis)

fig = go.Figure(layout=layout)

# The data curve read from the CSV file.
fig.add_trace(go.Scatter(x=curve_df['x'], y=curve_df['y1'], name='JNJW_sqrt'))

# The vertical segment: two points sharing x=12, joined by a line.
segment_x = [12, 12]
segment_y = [-300, 300]
fig.add_trace(go.Scatter(x=segment_x, y=segment_y, mode="lines", name="SIGNAL"))

fig.show()
Look here too.
how to plot a vertical line with plotly
A feature for vertical and horizontal lines is implemented with Plotly.py 4.12 (released 11/20). It works for plotly express and graph objects. See here: https://community.plotly.com/t/announcing-plotly-py-4-12-horizontal-and-vertical-lines-and-rectangles/46783
Simple example:
import plotly.express as px
df = px.data.stocks(indexed=True)
fig = px.line(df)
fig.add_vline(x='2018-09-24')
fig.show()
fig.add_vline(x=2.5, line_width=3, line_dash="dash", line_color="green")

Resources