Index of documents for similarity matrix and community louvain graph - python-3.x

I am running this script to calculate and plot the similarity between some documents.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import codecs
import string, re
import nltk
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from pathlib import Path
from matplotlib import cm as cm
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
path = "C:\\Users\\user\\Desktop\\texts\\dataset"
text_files = os.listdir(path)

# Filter the directory listing once, in a fixed (sorted) order, so that
# position i in `labels`, position i in `documents` and row/column i of the
# similarity matrix all refer to the same file.
labels = sorted(f for f in text_files if f.endswith('.txt'))

# os.listdir() returns bare file names, so each one has to be joined with
# `path` before it can be opened; read_text() also closes the file again.
documents = [Path(path, name).read_text(encoding="utf-8") for name in labels]

tfidf_vectorizer = TfidfVectorizer()
sparse_matrix = tfidf_vectorizer.fit_transform(documents)

# Product of the TF-IDF matrix with its own transpose: entry (i, j) is the
# similarity score between document i and document j.
pairwise_similarity = sparse_matrix * sparse_matrix.T
pairwise_similarity_array = pairwise_similarity.toarray()

# Same scores, indexed by file name on both axes, so a score can be looked up
# by document name (similarity_df.loc[a, b]) or exported with .to_csv().
similarity_df = pd.DataFrame(pairwise_similarity_array, index=labels, columns=labels)

fig, ax = plt.subplots(figsize=(20, 20))
cax = ax.matshow(pairwise_similarity_array, interpolation='spline16')
ax.grid(True)
plt.title('News articles similarity matrix')
# One tick per document, however many .txt files were found.
tick_positions = range(len(labels))
plt.xticks(tick_positions, labels, rotation=90)
plt.yticks(tick_positions, labels)
fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
plt.show()
Even though I have created a labels list, I was wondering how I can access the index of the documents so that I can associate a specific document with its scores. This would also be helpful for tracking documents in other tasks. For example, I am also using the Louvain community library to draw further assumptions about the dataset, but when I try to apply the labels list as labels it gives an error: AttributeError: 'list' object has no attribute 'items'
Here is the code and the output of community louvain
# load the karate club graph
# Build a graph from the pairwise similarity array.
# NOTE(review): the node ids of G are presumably the integer row/column
# positions of the array — confirm against the networkx documentation.
G = nx.from_numpy_matrix(pairwise_similarity_array)
# compute the best partition
partition = community_louvain.best_partition(G)
#print(partition)
modularity = community_louvain.modularity(partition, G)
print(modularity)
# draw the graph
pos = nx.spring_layout(G)
# color the nodes according to their partition
cmap = cm.get_cmap('coolwarm', max(partition.values()) + 1)
nx.draw_networkx_nodes(G, pos, partition.keys(), node_size=100,
cmap=cmap, node_color=list(partition.values()))
nx.draw_networkx_edges(G, pos, alpha=0.5)
# NOTE(review): `doc_labels` is not defined anywhere in this snippet. The
# reported "AttributeError: 'list' object has no attribute 'items'" indicates
# that a list is being passed as `labels`; the argument presumably has to be a
# dict mapping each node to its label text, e.g. dict(enumerate(labels)) —
# verify against the draw_networkx_labels documentation.
nx.draw_networkx_labels(G,pos, labels=doc_labels, font_size=12,font_family='sans-serif')
plt.show()
dendro = community_louvain.generate_dendrogram(G)

Related

Annotating clustering from DBSCAN to original Pandas DataFrame

I have working code that is utilizing dbscan to find tight groups of sparse spatial data imported with pd.read_csv.
I am maintaining the original spatial data locations and would like to annotate the labels returned by dbscan for each data point to the original dataframe and then write a csv with the same information.
So the code below is doing exactly what I would expect it to at this point, I would just like to extend it to import the label for each row in the original dataframe.
import argparse
import string
import os, subprocess
import pathlib
import glob
import gzip
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn.cluster import DBSCAN
# Load the data and keep only the coordinate columns used for clustering.
X = pd.read_csv(tmp_csv_name)
X = X.drop(columns=['Name', 'Type', 'SomeValue'])
# only columns 'x' and 'y' now remain
db = DBSCAN(eps=EPS, min_samples=minSamples, metric='euclidean', algorithm='auto', leaf_size=30).fit(X)
# Read the cluster labels from the estimator that was actually fitted above
# (the snippet previously referenced an undefined name here).
labels = db.labels_
unique_labels = set(labels)
# maxX , maxY are manual inputs temporarily
# Shrink both figure dimensions by 20% per pass until neither exceeds 16.
while sizeX > 16 or sizeY > 16:
    sizeX = sizeX * 0.8
    sizeY = sizeY * 0.8
fig, ax = plt.subplots(figsize=(sizeX, sizeY))
plt.xlim(0, maxX)
plt.ylim(0, maxY)
plt.scatter(X['x'], X['y'], c=colors, marker="o", picker=True)
# hackX , hackY are manual inputs temporarily
# which represent the boundaries defined in the original dataset
poly = patches.Polygon(xy=list(zip(hackX, hackY)), fill=False)
ax.add_patch(poly)
plt.show()

Scatter points assigned colour from CSV file

I am importing CSV data in the format x,y,z,p to plot a trisurface which has the scatter plots displayed on top.
The trisurface script works (ax.plot_trisurf), however, I would like to colour the scatter points (ax.scatter) according to either the 1 or -1 assigned in the fourth column of the CSV file.
enter image description here
The x,y,z data is complicated and can't be coloured, hence trying to assign it as simply as possible in the fourth column.
I have attached a basic image, essentially I just want to be able to have a selection of the red dots a different colour without affecting the trisurface they are on.
Any comments or suggestions are very welcome!
My most recent error is:
ax.scatter(X, Y, np.log10(Z), c= (not p <= (0)({True: 'g', False: 'r'})), marker='o')
TypeError: 'int' object is not callable
enter code here
from typing import Any
import matplotlib
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import csv
import bbox
import matplotlib.ticker as mticker
# Import CSV data
from numpy import ndarray
csvFileName = sys.argv[0]
csvData = []
with open('ParvaluesMESS.csv', 'r') as csvfile:
csvReader = csv.reader(csvfile, delimiter=',')
for csvRow in csvReader:
csvData.append(csvRow)
csvData = np.array(csvData)
csvData = csvData.astype(float)
X, Y, Z, p = csvData[:,0], csvData[:,1], csvData[:,2], csvData[:,3]
# Plot management: note Z is logged
# Change vmin and vmax values for colorbar range if needed
# Alpha value is transparency
# 111 means 1x1 grid, first subplot
# Create a 3-D axes and draw the surface; Z is plotted as log10(Z).
fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot(111, projection='3d')
cb = ax.plot_trisurf(X, Y, np.log10(Z), cmap='coolwarm', alpha=0.75)
#cb = ax.plot_trisurf(X, Y, np.log10(Z), cmap='coolwarm', alpha=0.75, vmin=0, vmax=1)
# NOTE(review): this call is the one the question is about. `col==(...)` is a
# comparison against the name `col`, not a keyword argument, and
# `0({True: 'g', False: 'r'})` tries to call the integer literal 0 — the same
# pattern as the "'int' object is not callable" error quoted in the question.
# The colours presumably need to be passed through scatter's `c=` argument
# as values derived from `p` — confirm against the matplotlib docs.
ax.scatter(X, Y, np.log10(Z), col==(p > 0({True: 'g', False: 'r'})), marker='o')
#ax.zaxis._set_scale('log')
def log_tick_formatter(val, pos=None):
    """Format a log10 tick value as a power of ten for display.

    Args:
        val: Tick position, i.e. the base-10 exponent being labelled.
        pos: Tick index supplied by matplotlib's FuncFormatter; unused.

    Returns:
        A mathtext string such as ``$10^{2}$``.
    """
    # Whole-number ticks keep the plain integer exponent. Fractional ticks
    # are shown as-is instead of being truncated, which would label a tick
    # at 0.5 as 10^0.
    exponent = int(val) if float(val).is_integer() else f"{val:g}"
    return f"$10^{{{exponent}}}$"
# Set Z axis to log
ax.zaxis.set_major_formatter(mticker.FuncFormatter(log_tick_formatter))
# ax.zaxis.set_major_locator(mticker.MaxNLocator(integer=True))
def ticklabels(ticks):
    """Return a ``2^<exponent>`` label for every tick value.

    Args:
        ticks: Iterable of positive numbers.

    Returns:
        A list of strings, one per tick, where the exponent is the base-2
        logarithm of the tick value.
    """
    return [f'2^{np.log2(tick)}' for tick in ticks]
fig.colorbar(cb, shrink=0.5)
ax.set_title("First-year sea ice PAR")
ax.set_xlabel("Ice Thickness m")
ax.set_ylabel("Snow thickness m")
ax.set_zlabel("µmol $^{m-2}$ $^{s-1}$")
ax.view_init(azim=70, elev=30)
ax.set_xlim3d(20, 350)
image_format = 'png' # e.g .png, .svg, etc.
image_name = 'test.eps'
plt.show()
fig.savefig(image_name, format=image_format, dpi=1200)
it was resolved by rearranging into arrays:
from typing import Any
import matplotlib
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import matplotlib.ticker as mticker
from numpy import ndarray
# The CSV has no header row, so the four columns are named explicitly.
data = pd.read_csv('ParvaluesMESS.csv', header=None, sep=',',
                   names=['Ice', 'Snow', 'umol', 'P'])
# One flat float array per column, as expected by plot_trisurf/scatter.
x = data['Ice'].to_numpy(dtype=float)
y = data['Snow'].to_numpy(dtype=float)
z = data['umol'].to_numpy(dtype=float)
p = data['P'].to_numpy(dtype=float)
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111, projection='3d')
cb = ax.plot_trisurf(x, y, np.log10(z), cmap='coolwarm', alpha=0.75)
ax.scatter(x, y, np.log10(z),c=p,cmap='RdYlGn')
#your formats
def log_tick_formatter(val, pos=None):
    """Format a log10 tick value as a power of ten for display.

    Args:
        val: Tick position, i.e. the base-10 exponent being labelled.
        pos: Tick index supplied by matplotlib's FuncFormatter; unused.

    Returns:
        A mathtext string such as ``$10^{2}$``.
    """
    # Whole-number ticks keep the plain integer exponent. Fractional ticks
    # are shown as-is instead of being truncated, which would label a tick
    # at 0.5 as 10^0.
    exponent = int(val) if float(val).is_integer() else f"{val:g}"
    return f"$10^{{{exponent}}}$"
# Set Z axis to log
ax.zaxis.set_major_formatter(mticker.FuncFormatter(log_tick_formatter))
# ax.zaxis.set_major_locator(mticker.MaxNLocator(integer=True))
def ticklabels(ticks):
    """Return a ``2^<exponent>`` label for every tick value.

    Args:
        ticks: Iterable of positive numbers.

    Returns:
        A list of strings, one per tick, where the exponent is the base-2
        logarithm of the tick value.
    """
    return [f'2^{np.log2(tick)}' for tick in ticks]
fig.colorbar(cb, shrink=0.5)
ax.set_title("First-year sea ice PAR")
ax.set_xlabel("Ice Thickness m")
ax.set_ylabel("Snow thickness m")
ax.set_zlabel("µmol $^{m-2}$ $^{s-1}$")
ax.view_init(azim=70, elev=30)
ax.set_xlim3d(20, 350)
image_name = 'BenImag'
image_format = 'png' # e.g .png, .svg, etc.
plt.show()
fig.savefig(image_name, format=image_format, dpi=1200)

Getting rid of extra lines in Python shapefile plot?

I am trying to do a basic plot of the world map using Python and the Matplotlib library. However, when I plot the polygons the plot shows many straight lines that do not seem to be part of the polygon. I am relatively new at working with shapefiles but the code I'm using has worked for a previous shapefile I used, so I'm confused and wondering what might be missing in the code.
The code I'm using is:
import numpy as np
import pandas as pd
import shapefile as shp
import matplotlib.pyplot as plt
import seaborn as sns
import os
sns.set(style='whitegrid', palette='ocean', color_codes=True)
sns.mpl.rc('figure', figsize=(10,6))
sf = shp.Reader(shp_path)
def plot_map(sf, x_lim=None, y_lim=None, figsize=(11, 9)):
    """Plot the outline of every shape in a shapefile reader.

    Args:
        sf: Object providing ``shapeRecords()``; each record's
            ``shape.points`` is read as a sequence of (x, y) pairs.
        x_lim: Optional x-axis limits passed to ``plt.xlim``.
        y_lim: Optional y-axis limits passed to ``plt.ylim``.
        figsize: Figure size passed to ``plt.figure``.

    When neither limit is given, each shape is annotated with its running
    index at the mean of its points. Axis limits are applied only when both
    are given.
    """
    plt.figure(figsize=figsize)
    annotate = x_lim is None and y_lim is None
    for shape_index, record in enumerate(sf.shapeRecords()):
        points = record.shape.points
        x = [point[0] for point in points]
        y = [point[1] for point in points]
        # NOTE(review): all points of a record are drawn as one continuous
        # line, so any jump between consecutive points is joined by a straight
        # segment — presumably the source of the stray lines; confirm.
        plt.plot(x, y, 'k')
        if annotate:
            plt.text(np.mean(x), np.mean(y), shape_index, fontsize=10)
    if x_lim is not None and y_lim is not None:
        plt.xlim(x_lim)
        plt.ylim(y_lim)
plot_map(sf)
plt.show()
The following link shows resulting graph (I'm not allowed to post pictures yet?):
Any help is appreciated, thank you all!
pls use 'k.', or use scatter instead of plot
import numpy as np
import pandas as pd
import shapefile as shp
import matplotlib.pyplot as plt
import seaborn as sns
import os
sns.set(style='whitegrid', palette='ocean', color_codes=True)
sns.mpl.rc('figure', figsize=(10,6))
sf = shp.Reader(shp_path)
def plot_map(sf, x_lim=None, y_lim=None, figsize=(11, 9)):
    """Plot the points of every shape in a shapefile reader.

    Args:
        sf: Object providing ``shapeRecords()``; each record's
            ``shape.points`` is read as a sequence of (x, y) pairs.
        x_lim: Optional x-axis limits passed to ``plt.xlim``.
        y_lim: Optional y-axis limits passed to ``plt.ylim``.
        figsize: Figure size passed to ``plt.figure``.

    When neither limit is given, each shape is annotated with its running
    index at the mean of its points. Axis limits are applied only when both
    are given.
    """
    plt.figure(figsize=figsize)
    annotate = x_lim is None and y_lim is None
    for shape_index, record in enumerate(sf.shapeRecords()):
        points = record.shape.points
        x = [point[0] for point in points]
        y = [point[1] for point in points]
        # 'k.' draws black point markers only, so consecutive points are not
        # joined by line segments.
        plt.plot(x, y, 'k.')
        if annotate:
            plt.text(np.mean(x), np.mean(y), shape_index, fontsize=10)
    if x_lim is not None and y_lim is not None:
        plt.xlim(x_lim)
        plt.ylim(y_lim)
plot_map(sf)
plt.show()

colour map grids based on value in pandas dataframe

I want to fill the gridded map with colors based on the value of interest. A sample data is here:
import pandas as pd
df = pd.DataFrame()
df['lon'] = [100,105,110,115,120,125,130]
df['lat'] = [38,40,42,44,46,48,50]
df['value'] = [1,2,3,4,5,6,7]
Specifically, is it possible to do this with Cartopy? I found a similar question here:https://stackoverflow.com/questions/53412785/plotting-pandas-csv-data-onto-cartopy-map. But that post was to plot scattered points, I need to fill the grids with colors.
I myself tried:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
# Build 2-D coordinate grids from the 1-D lon/lat columns.
lon, lat = np.meshgrid(df['lon'], df['lat'])
fig = plt.figure(figsize=[15,15])
ax = plt.axes(projection=ccrs.PlateCarree())
# NOTE(review): the sample frame defines a 'value' column, not 'variable'.
# The colour data passed here is a single 1-D column while lon/lat are 2-D
# grids — presumably the cause of the reported "not enough values to unpack"
# error; confirm against the pcolormesh documentation.
ax.pcolormesh(lon,lat,df['variable'],latlon=True,cmap='jet')
plt.show()
The error is at ax.pcolormesh(...), it says "not enough values to unpack (expected 2, got 1)"
Many thanks for your help.
For discrete data you can create rectangular patches for each point. Here is a possible solution for your sample data set. Each row of data (lat, long, value) is used to create a rectangular patch. The value is normalized by dividing with max(value) to enable using colormap for coloring the patches.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import matplotlib.patches as mpatches
def make_rect(clon, clat, dlon, dlat):
    """Return the corners of a rectangle centred on (clon, clat).

    Args:
        clon: Longitude of the rectangle centre.
        clat: Latitude of the rectangle centre.
        dlon: Full width of the rectangle in longitude.
        dlat: Full height of the rectangle in latitude.

    Returns:
        A list of five ``[lon, lat]`` pairs: lower-left, upper-left,
        upper-right, lower-right, then lower-left again to close the ring.
    """
    half_width = dlon / 2.
    half_height = dlat / 2.
    lon_min, lon_max = clon - half_width, clon + half_width
    lat_min, lat_max = clat - half_height, clat + half_height
    # clockwise from the lower-left corner
    ll = [lon_min, lat_min]
    ul = [lon_min, lat_max]
    ur = [lon_max, lat_max]
    lr = [lon_max, lat_min]
    return [ll, ul, ur, lr, ll]
df = pd.DataFrame()
df['lon'] = [100, 105, 110, 115, 120, 125, 130]
df['lat'] = [38, 40, 42, 44, 46, 48, 50]
df['value'] = [1, 2, 3, 4, 5, 6, 7]  # not suffice for meshgrid plot

# The colormap to use. pyplot.get_cmap is used because the matplotlib.cm
# module-level get_cmap is no longer available in recent Matplotlib releases.
cm = plt.get_cmap('jet')

fig = plt.figure(figsize=[8, 6])
ax = plt.axes(projection=ccrs.PlateCarree(), extent=[95, 134, 35, 52])

# plot the red dots using the available data
# comment out if not needed
ax.plot(df['lon'], df['lat'], 'ro')

# plot rectangular patches at the data points
dlon, dlat = 5, 2  # spacings between data points
# Values are divided by the column maximum before being passed to the
# colormap; the maximum is computed once rather than on every iteration.
max_value = df['value'].max()
for lon1, lat1, val1 in zip(df['lon'], df['lat'], df['value']):
    pcorners = make_rect(lon1, lat1, dlon, dlat)
    poly = mpatches.Polygon(pcorners, ec='gray', fill=True, lw=0.25,
                            fc=cm(val1 / max_value), transform=ccrs.PlateCarree())
    ax.add_patch(poly)

ax.gridlines(draw_labels=True)
plt.show()
The output plot:

How to loop through items in pandas col and run and plot a scikit model?

I got some interesting user data from races. I know when the respective athletes planned to finish a race and I know when they actually finished (along with some other data). The goal is to find out when the athletes come in late. I want to run a support vector machine for each athlete and plot the decision boundaries.
Here is what I do:
import numpy as np
import pandas as pd
from sklearn import svm
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt
# Create arbitrary dataset for example
df = pd.DataFrame({'User': np.random.random_integers(low=1, high=4, size=50),
'Planned_End': np.random.uniform(low=-5, high=5, size=50),
'Actual_End': np.random.uniform(low=-1, high=1, size=50),
'Late': np.random.random_integers(low=0, high=2, size=50)}
)
# Fit Support Vector Machine Classifier
X = df[['Planned_End', 'Actual_End']]
y = df['Late']
clf = svm.SVC(decision_function_shape='ovo')
for i, y in df['User']:
clf.fit(X, y)
ax = plt.subplot()
fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
plt.title(lab)
plt.show()
I get the following error: TypeError: 'numpy.int64' object is not iterable - that is, I somehow can't loop through the column.
I guess it comes down to the numpy data format? How can I solve that?
try iteritems()
for i, y in df['User'].iteritems():
Your User Series contains numpy.int64 objects so you can only use:
for y in df['User']:
And you don't use i anywhere.
As for the rest of the code, this produces some solution, please edit accordingly:
import numpy as np
import pandas as pd
from sklearn import svm
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt
# Create arbitrary dataset for example
# np.random.randint excludes the upper bound, so each `high` is one more than
# the largest value wanted (users 1..4, late flags 0..2). It replaces the
# deprecated np.random.random_integers, whose upper bound was inclusive.
df = pd.DataFrame({'User': np.random.randint(low=1, high=5, size=50),
                   'Planned_End': np.random.uniform(low=-5, high=5, size=50),
                   'Actual_End': np.random.uniform(low=-1, high=1, size=50),
                   'Late': np.random.randint(low=0, high=3, size=50)}
                  )

# Fit Support Vector Machine Classifier
# DataFrame.as_matrix() no longer exists in current pandas; to_numpy() returns
# the same plain 2-D array.
X = df[['Planned_End', 'Actual_End']].to_numpy()
# The classifier is fitted on the 'User' column, exactly as before; the
# earlier assignment from df['Late'] was overwritten before use and is gone.
y = df['User'].to_numpy()
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(X, y)

ax = plt.subplot()
fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
plt.title('lab')
plt.show()

Resources