I would like to combine different risk ratios into one forest plot. I would expect the output to be similar to metamiss in STATA or metafor in R. How can I do this in Python?
By using the zEPID package I create a forest plot of different risk ratios.
import matplotlib.image as mpimg
import numpy as np
import matplotlib.pyplot as plt
import zepid
from zepid.graphics import EffectMeasurePlot
labs = ["ACA(Isq=41.37% Tausq=0.146 pvalue=0.039 )",
"ICA0(Isq=25.75% Tausq=0.092 pvalue=0.16 )",
"ICA1(Isq=60.34% Tausq=0.121 pvalue=0.00 )",
"ICAb(Isq=25.94% Tausq=0.083 pvalue=0.16 )",
"ICAw(Isq=74.22% Tausq=0.465 pvalue=0.00 )"]
measure = [2.09,2.24,1.79,2.71,1.97]
lower = [1.49,1.63,1.33,2.00,1.25]
upper = [2.92,3.07,2.42,3.66,3.11]
p = EffectMeasurePlot(label=labs, effect_measure=measure, lcl=lower, ucl=upper)
p.labels(effectmeasure='RR')
p.colors(pointshape="D")
ax=p.plot(figsize=(7,3), t_adjuster=0.09, max_value=4, min_value=0.35 )
plt.title("Random Effect Model(Risk Ratio)",loc="right",x=1, y=1.045)
plt.suptitle("Missing Data Imputation Method",x=-0.1,y=0.98)
ax.set_xlabel("Favours Control Favours Haloperidol ", fontsize=10)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(False)
plt.savefig("Missing Data Imputation Method",bbox_inches='tight')
The statsmodels library has an API for doing simple meta-analysis and plotting forest plots. It supports DerSimonian-Laird (chi2) and Paule-Mandel (iterated). See the statsmodel docs for more use cases, options and examples.
An example from their docs:
import numpy as np
from statsmodels.stats.meta_analysis import combine_effects
# dummy data
mean_effect = np.array([61.00,61.40,62.21,62.30,62.34,62.60,62.70,62.84,65.90])
var_effect = np.array([0.2025,1.2100,0.0900,0.2025,0.3844,0.5625,0.0676,0.0225,1.8225])
idx = ['lab1','lab2','lab3','lab4','lab5','lab6','lab7','lab8','lab9']
# meta-analysis and forest plot
results = combine_effects(mean_effect, var_effect, method_re="chi2", use_t=True, row_names=idx)
print(results.summary_frame())
fig = results.plot_forest()
Output:
eff sd_eff ci_low ci_upp w_fe w_re
lab1 61.000000 0.450000 60.118016 61.881984 0.057436 0.123113
lab2 61.400000 1.100000 59.244040 63.555960 0.009612 0.040314
lab3 62.210000 0.300000 61.622011 62.797989 0.129230 0.159749
lab4 62.300000 0.450000 61.418016 63.181984 0.057436 0.123113
lab5 62.340000 0.620000 61.124822 63.555178 0.030257 0.089810
lab6 62.600000 0.750000 61.130027 64.069973 0.020677 0.071005
lab7 62.700000 0.260000 62.190409 63.209591 0.172052 0.169810
lab8 62.840000 0.150000 62.546005 63.133995 0.516920 0.194471
lab9 65.900000 1.350000 63.254049 68.545951 0.006382 0.028615
fixed effect 62.583397 0.107846 62.334704 62.832090 1.000000 NaN
random effect 62.390139 0.245750 61.823439 62.956838 NaN 1.000000
fixed effect wls 62.583397 0.189889 62.145512 63.021282 1.000000 NaN
random effect wls 62.390139 0.294776 61.710384 63.069893 NaN 1.000000
I’d also recommend having a read through the docs for the individual methods such as combine_effects() which contains additional notes and references regarding the implementation.
Since I haven't found a customizable package to create a forest plot, I developed myforestplot for that purpose.
The following is one example of a forest plot using titanic dataset.
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import matplotlib.pyplot as plt
import myforestplot as mfp
data = (pd.read_csv("titanic.csv")
[["survived", "pclass", "sex", "age", "embark_town"]]
.dropna()
)
ser = data["age"]
data["age"] = (ser
.mask(ser >= 40, "40 or more")
.mask(ser < 40, "20_39")
.mask(ser <20, "0_19")
)
res = smf.logit("survived ~ sex + age + embark_town", data=data).fit()
order = ["age", "sex", "embark_town"]
cont_cols = []
item_order = {"embark_town": ['Southampton', 'Cherbourg', 'Queenstown'],
"age": ["0_19", "20_39", "40 or more"]
}
df = mfp.statsmodels_pretty_result_dataframe(data, res,
order=order,
cont_cols=cont_cols,
item_order=item_order,
fml=".3f",
)
df is a dataframe for creating a forest plot.
plt.rcParams["font.size"] = 8
fp = mfp.SimpleForestPlot(ratio=(8,3), dpi=150, figsize=(5,3), df=df,
vertical_align=True)
fp.errorbar(errorbar_kwds=None, log_scale=True)
xticklabels = [0.1, 0.5, 1.0, 2.0]
fp.ax2.set_xlim(np.log([0.1, 1.5]))
fp.ax2.set_xticks(np.log(xticklabels))
fp.ax2.set_xticklabels(xticklabels)
fp.ax2.set_xlabel("OR (log scale)")
fp.ax2.axvline(x=0, ymin=0, ymax=1.0, color="black", alpha=0.5)
fp.ax1.set_xlim([0.35, 1])
fp.embed_cate_strings("category", 0.3, header="Category",
text_kwds=dict(fontweight="bold"),
header_kwds=dict(fontweight="bold")
)
fp.embed_strings("item", 0.36, header="", replace={"age":""})
fp.embed_strings("nobs", 0.60, header="N")
fp.embed_strings("risk_pretty", 0.72, header="OR (95% CI)")
fp.horizontal_variable_separators()
fp.draw_outer_marker(log_scale=True, scale=0.008)
plt.show()
and we obtain the figure.
A forest plot image
Related
Aim of the task: I have sets of coordinates (X and Y) coordinates of the geometry and I want to make my geometry aligned. The coordinate and respective geometry is shown in the picture.
X1_coordinate = [0.0, 0.87, 1.37, 1.87, 2.73, 3.6, 4.46, 4.96, 5.46, 4.6, 3.73, 2.87, 2.0, 1.5, 1.0, 0.5, 2.37, 3.23, 4.1]
Y1_coordinate = [0.0, 0.5, -0.37, -1.23, -0.73, -0.23, 0.27, -0.6, -1.46, -1.96, -2.46, -2.96, -3.46, -2.6, -1.73, -0.87, -2.1, -1.6, -1.1]
Question: Can I apply Principal Component Analysis on 2D geometries to make it aligned such that its principal axis will be parallel to the reference axis (X and Y)?
Expected output: I want my geometry like this. This is just an example. I want my geometry in such as way that, principal axis of geometry lies on the reference axis or be parallel to reference axis.
What I tried: I tried below code to implement PCA and to obtain the geometry aligned.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
plt.style.use('ggplot')
# Load the data
# iris = datasets.load_iris()
X = X1_coordinate
y = Y1_coordinate
# Z-score the features
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
# The PCA model
pca = PCA(n_components=2) # estimate only 2 PCs
X_new = pca.fit_transform(X) # project the original data into the PCA space
However, after running the code, I got error as mentioned below.
Kindly let me know what should I do to make my geometry aligned. Looking forward to get answers.
Basically, you can apply PCA to this task.
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
circle_pts = sklearn.datasets.make_circles() #get two circles with labels
circle_pts = circle_pts[0][circle_pts[1]==0] #leave only one circle
ang = 63/180*np.pi #radians of rotation
R = np.array([[np.cos(ang), -np.sin(ang)], [np.sin(ang), np.cos(ang)]])
ellipse_pts = circle_pts
ellipse_pts[:,0] *= 4.5
ellipse_rot_pts = ellipse_pts # R.T
plt.figure()
plt.scatter(ellipse_rot_pts[:,0], ellipse_rot_pts[:,1])
plt.axis("equal")
plt.tight_layout()
plt.show()
scaler = StandardScaler(with_std=False)
scaler.fit(ellipse_rot_pts)
X = scaler.transform(ellipse_rot_pts)
pca = PCA(n_components=2) # estimate only 2 PCs
X_new = pca.fit_transform(X) # project the original data into the PCA space
plt.figure()
plt.scatter(X[:,0],X[:,1])
singular_values = pca.singular_values_
plt.plot([0, singular_values[0]*pca.components_[0,0]], [0, singular_values[0]*pca.components_[0,1]])
plt.plot([0, singular_values[1]*pca.components_[1,0]], [0, singular_values[1]*pca.components_[1,1]])
plt.axis("equal")
plt.show()
plt.figure()
plt.title("Aligned with axis figure")
plt.scatter(X_new[:,0],X_new[:,1])
plt.axis("equal")
plt.show()
But the problem is that not every geometry is appropriate for this. ellipse has two main axis of symmetry. Your figure for example doesn't. So principal components that are been seeking via maximum variance in data doesn't correspond with your example(expected output) axis alignement.
For example your set of points correspond to this variant of components alignment:
Your geometry
And for a modificated little more symmetric object.
A little more symmetrical figure
Hope i helped
I create a plot using sns base on a DafaFrame.
Now, I would like to add new curve from another dataframe on the plot created previusly.
This is the code of my plot:
tline = sns.lineplot(x='reads', y='time', data=df, hue='method', style='method', markers=True, dashes=False, ax=axs[0, 0])
tline.set_xlabel('Numero di reads')
tline.set_ylabel ('Time [s]')
tline.legend(loc='lower right')
tline.set_yscale('log')
tline.autoscale(enable=True, axis='x')
tline.autoscale(enable=True, axis='y')
Now I have another Dataframe with the same column of the first DataFrame. How can I add this new curve with a custom entry in the legend?
This is the structure of the DataFrame:
Dataset
Method
Reads
Time
Peak-memory
14M
Set
14000000
7.33
1035204
20K
Set
200000
0.38
107464
200K
Set
20000
0.07
42936
2M
Set
28428648
16.09
2347740
28M
Set
2000000
1.41
240240
I suggest to use matplotlibs OOP interface like this
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
# generate sample data
time_column = np.arange(10)
data_column1 = np.random.randint(0, 10, 10)
data_column2 = np.random.randint(0, 10, 10)
# store in pandas dfs
df1 = pd.DataFrame(zip(time_column, data_column1), columns=['Time', 'Data'])
df2 = pd.DataFrame(zip(time_column, data_column2), columns=['Time', 'Data'])
f, ax = plt.subplots()
sns.lineplot(df1.Time, df1.Data, label='foo', ax=ax)
sns.lineplot(df2.Time, df2.Data, label='bar', ax=ax)
ax.legend()
plt.show()
which generates the following output
the important thing is that both lineplots are on the same subplot (ax in this case).
So, here is my code:
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from fitter import Fitter, get_common_distributions
df = pd.read_csv("project3.csv")
bins = [282.33, 594.33, 906.33, 1281.33, 15030.33, 1842.33, 2154.33, 2466.33, 2778.33, 3090.33, 3402.33]
#declaring
facecolor = '#EAEAEA'
color_bars = '#3475D0'
txt_color1 = '#252525'
txt_color2 = '#004C74'
fig, ax = plt.subplots(1, figsize=(16, 6), facecolor=facecolor)
ax.set_facecolor(facecolor)
n, bins, patches = plt.hist(df.City1, color=color_bars, bins=10)
#grid
minor_locator = AutoMinorLocator(2)
plt.gca().xaxis.set_minor_locator(minor_locator)
plt.grid(which='minor', color=facecolor, lw = 0.5)
xticks = [(bins[idx+1] + value)/2 for idx, value in enumerate(bins[:-1])]
xticks_labels = [ "{:.0f}-{:.0f}".format(value, bins[idx+1]) for idx, value in enumerate(bins[:-1])]
plt.xticks(xticks, labels=xticks_labels, c=txt_color1, fontsize=13)
#beautify
ax.tick_params(axis='x', which='both',length=0)
plt.yticks([])
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
for idx, value in enumerate(n):
if value > 0:
plt.text(xticks[idx], value+5, int(value), ha='center', fontsize=16, c=txt_color1)
plt.title('Histogram of rainfall in City1\n', loc = 'right', fontsize = 20, c=txt_color1)
plt.xlabel('\nCentimeters of rainfall', c=txt_color2, fontsize=14)
plt.ylabel('Frequency of occurrence', c=txt_color2, fontsize=14)
plt.tight_layout()
#plt.savefig('City1_Raw.png', facecolor=facecolor)
plt.show()
city1 = df['City1'].values
f = Fitter(city1, distributions=get_common_distributions())
f.fit()
fig = f.plot_pdf(names=None, Nbest=4, lw=1, method='sumsquare_error')
plt.show()
print(f.get_best(method = 'sumsquare_error'))
The issue is with the plots it shows. The first histogram it generates is
Next I get another graph with best fitted distributions which is
Then an output statement
{'chi2': {'df': 10.692966790090342, 'loc': 16.690849400411103, 'scale': 118.71595997157786}}
Process finished with exit code 0
I have a couple of questions. Why is chi2, the best fitted distribution not plotted on the graph?
How do I plot these distributions on top of the histograms and not separately? The hist() function in fitter library can do that but there I don't get to control the bins and so I end up getting like 100 bins with some flat looking data.
How do I solve this issue? I need to plot the best fit curve on the histogram that looks like image1. Can I use any other module/package to get the work done in similar way? This uses least squares fit but I am OK with least likelihood or log likelihood too.
Simple way of plotting things on top of each other (using some properties of the Fitter class)
import scipy.stats as st
import matplotlib.pyplot as plt
from fitter import Fitter, get_common_distributions
from scipy import stats
numberofpoints=50000
df = stats.norm.rvs( loc=1090, scale=500, size=numberofpoints)
fig, ax = plt.subplots(1, figsize=(16, 6))
n, bins, patches = ax.hist( df, bins=30, density=True)
f = Fitter(df, distributions=get_common_distributions())
f.fit()
errorlist = sorted(
[
[f._fitted_errors[dist], dist]
for dist in get_common_distributions()
]
)[:4]
for err, dist in errorlist:
ax.plot( f.x, f.fitted_pdf[dist] )
plt.show()
Using the histogram normalization, one would need to play with scaling to generalize again.
I am trying to estimate a normal density using a quadratic approximation in tensorflow (code 4.14 from McElreath's Statistical Rethinking).
The code I have so far is:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
_BASE_URL = "https://raw.githubusercontent.com/rmcelreath/rethinking/Experimental/data"
HOWELL_DATASET_PATH = f"{_BASE_URL}/Howell1.csv"
df = pd.read_csv(HOWELL_DATASET_PATH, sep=';')
df = df[df['age'] >= 18]
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma= tf.linspace(start=4.0, stop=9.0, num=200)
tf.reduce_sum(tfd.Normal(loc=mu, scale=sigma).log_prob(df.height))
This fails due to df having shape (352,) whilst I am creating (200,) points for my normal distribution to be evaluated on.
However
tf.reduce_sum(tfd.Normal(loc=mu, scale=sigma).log_prob(2))
and
tf.reduce_sum(tfd.Normal(loc=mu[0], scale=sigma[0]).log_prob(df.height))
both work.
I need to create a (200, 352) tensor - one Normal for each mu, sigma on my grid, and then evaluate it with my sample data - df. The question I have is: how do I do this?
I think TFP's joint distribution is a nice way to express this:
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma = tf.linspace(start=7.0, stop=9.0, num=200)
def mk_joint(nobs):
return tfd.JointDistributionNamed(dict(
mu=tfd.Normal(178, 20),
sigma=tfd.Uniform(0, 50),
height=lambda mu, sigma: tfd.Sample(tfd.Normal(loc=mu, scale=sigma), nobs)
))
joint = mk_joint(len(df))
joint.sample()
print(f'joint event shape: {joint.event_shape}')
lp = joint.log_prob(dict(mu=mu[:,tf.newaxis], sigma=sigma, height=df.height))
import matplotlib.pyplot as plt
plt.imshow(lp)
plt.xlabel('sigma')
plt.xticks(np.arange(len(sigma))[::10], sigma[::10].numpy().round(2), rotation=90)
plt.ylabel('mu')
plt.yticks(np.arange(len(mu))[::10], mu[::10].numpy().round(2))
plt.show()
=>
joint event shape: {'sigma': TensorShape([]), 'mu': TensorShape([]), 'height': TensorShape([352])}
So, I figured out that one way to do it would be to create a (200, 200, 352) grid and then reshape, and the rest of the calculations follow straightforwardly.
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
_BASE_URL = "https://raw.githubusercontent.com/rmcelreath/rethinking/Experimental/data"
HOWELL_DATASET_PATH = f"{_BASE_URL}/Howell1.csv"
df = pd.read_csv(HOWELL_DATASET_PATH, sep=';')
df = df[df['age'] >= 18]
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma = tf.linspace(start=7.0, stop=9.0, num=200)
means, variances, _ = tf.meshgrid(mu, sigma, np.zeros((352,)).astype(np.float32))
means = tf.reshape(means, [40000, 352])
variances = tf.reshape(variances, [40000, 352])
normal = tfd.Normal(loc=means, scale=variances)
log_lik = tf.reduce_sum(normal.log_prob(df.height), axis=1)
logprob_mu = tfd.Normal(178.0, 20.0).log_prob(means)
logprob_sigma = tfd.Uniform(low=0.0, high=50.0).log_prob(variances)
log_joint_prod = log_lik + logprob_mu[:, 0] + logprob_sigma[:, 0]
joint_prob_tf = tf.exp(log_joint_prod - tf.reduce_max(log_joint_prod))
I used KMeans for clustering as shown below, but I don't know to plot my clusters in a scatter plot.
Or like This plot too
My code is:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
documents = ["This little kitty came to play when I was eating at a restaurant.",
"Merley has the best squooshy kitten belly.",
"Google Translate app is incredible.",
"If you open 100 tab in google you get a smileyface.",
"Best cat photo I've ever taken.",
"Climbing ninja cat.",
"Impressed with google map feedback.",
"Key promoter extension for Google Chrome."]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
If I understand your question correctly, I think you might be looking to do something like this? I plotted the data, coloring by label, after converting to cluster distance space.
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import pandas as pd
documents = ["This little kitty came to play when I was eating at a restaurant.",
"Merley has the best squooshy kitten belly.",
"Google Translate app is incredible.",
"If you open 100 tab in google you get a smileyface.",
"Best cat photo I've ever taken.",
"Climbing ninja cat.",
"Impressed with google map feedback.",
"Key promoter extension for Google Chrome."]
df = pd.DataFrame(documents) # read in your data with pd.read_csv or if in list form like above do this
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df[0].values.astype('U')) # make sure you have unicode strings [0] is the column of the sentences
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=2000, n_init=20)
Xt = model.fit_transform(X)
# things with tf-idf score
X = X.toarray()
fns = np.array(vectorizer.get_feature_names()) # feature names/ordered by index
# retrieve labels with max score
labels = model.labels_
d = []
for n in sorted(np.unique(labels)):
t = X[(labels == n)].sum(axis=0) #max tf/idf score cumulative/cluster
words = fns[t == t.max()]
d.append(",".join(words))
t = Xt.T # cluster distance space X transpose to be plotted with mpl
### plot the clusters
fig, ax = plt.subplots(1,1)
cluster_color_dict = {0:'purple', 1 :'blue'} # change these to desired colors
for i in range(len(t[0])):
ax.scatter(t[0][i], t[1][i], c= cluster_color_dict[labels[i]], edgecolors='grey', lw = 0.5, s = 200)
p1 = [] # legend patches
for i in range(2):
print i
h = ax.scatter([],[], c= cluster_color_dict[i],
edgecolors= 'grey', lw = 0.5, s = 80, label = d[i])
p1.append(h)
l1 = ax.legend(handles = p1, title= 'cluster', bbox_to_anchor = (1,1), loc = 'upper left')