I'm working on old French Napoleonian cadastre, I've vectorized it, and now I've been studying parcels' neighborhood relation. I want to know which polygon is next to which polygon.
I tried NetworkX python library, but I did not succeed to convert my shapefile to a graph. I want to extract centroids from my polygons and trace relation between them.
I can use line shapefile or area shapefile to represent my parcels.
There is my python code:
import networkx as nx
import matplotlib.pyplot as plt
G=nx.read_shp('path/to/shp') #Read shapefile as graph
pos = {xy: xy for xy in G.nodes()}
nx.draw_networkx_nodes(G,pos,node_size=10,node_color='r')
nx.draw_networkx_edges(G,pos,edge_color='b')
plt.show()
This is my shapefile:
All right, I've fixed my problem with PySal library.
There is my code if someone need to generate some graphs!
##Définition des relations d'adjacence
qW = ps.queen_from_shapefile(str(planche)+".shp")
dataframe = ps.pdio.read_files(str(planche)+".shp")
## Affiche la matrice de voisinage complète.
Wmatrix, ids = qW.full() #ou ps.full(qW)
print("Matrice d'adjacence complète:\n", Wmatrix)
print("\n")
## pour compter le nombre de voisins que possède une parcelle:
n_neighbors = Wmatrix.sum(axis=1)
for i in range (len(n_neighbors)):
if n_neighbors[i] != 0:
print("La parcelle %i a %i voisins" %(i,n_neighbors[i]))
else:
print("La parcelle %i n'a pas de voisin" %i)
print("")
## Affiche [parcelle choisie, ses voisins]:
for i in range (len(n_neighbors)):
self_and_neighbors = [i]
self_and_neighbors.extend(qW.neighbors[i])
if self_and_neighbors[1:] == []:
print("La parcelle %i n'a pas de voisin" %i)
else:
print("Les voisins de la parcelle %i sont les parcelles " %i, self_and_neighbors[1:])
##Extractions des coordonnées des centroïds:
centroids = np.array([list(poly.centroid) for poly in dataframe.geometry])
plt.plot(centroids[:,0], centroids[:,1],'.')
for k,neighs in qW.neighbors.items():
#print(k,neighs)
#origin = centroids[k]
for neigh in neighs:
segment = centroids[[k,neigh]]
plt.plot(segment[:,0], segment[:,1], '-')
##Affichage des numéros des centroïds sur le graph:
for i in range (len(centroids)):
plt.text(centroids[i][0],centroids[i][1],str(i))
plt.title('Graph de la planche '+str(planche)+ " de l'année "+str(annee))
print("\nDuree: %.2f sec" %(time.time()-deb))
show()
Related
So I've been working on a script that a co-worker of mine made now i fixed some of his issues but i cannot seem to figure out why it only actually works when i run it in debugging mode in VSC even when i run it from a normal python shell it does not give me the output files that it does when running in debug mode does anyone know why? (Some Links and sensitive company data has been removed)
here is the code:
import requests
from requests.auth import HTTPBasicAuth
import json
import csv
import os
import pandas as pd
import datetime
import urllib3
from datetime import datetime, timedelta
#______________________________________________________________________________________
#main functie
def Main():
#http request met api account naar de export lijst Rapid7
urllib3.disable_warnings() #negeert de waarschuwingen van de self signed certificaten
url = "URL REMOVED"
r= requests.get(url,verify=False, auth=HTTPBasicAuth('REMOVED', 'REMOVED))
#data opslaan van de http request in csv formaat
with open('downloaded.csv', 'wb') as csv_file:
csv_file.write(r.content)
#open het input bestand van AD
Filenameslist = "C:\Robert-Code\ComputerListForRapid7.json" #volledig pad naar het bestand toegevoegd
with open(Filenameslist) as f:
data = json.load(f)
#converteer json naar een CSV-besand
with open("computerlist.csv", "w") as f:
fieldnames = data[3].keys()
# haal de keys van de 3e regel, want soms is de eerste regel van de sourcefile leeg
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for row in data:
writer.writerow(row)
#Ik gebruik de Pandas module om alleen de kolom "name" van het Rapid7 bestand te krijgen die ik nodig heb.
# Dan draai ik naam en adres om, zodat de naam in de eerste kolom staat. Hiervoor lees ik de CSV in een dataframe
dfR7 = pd.read_csv("downloaded.csv")
titles = list(dfR7.columns)
titles[0],titles[1] = titles[1],titles[0] #draai om, zodat name in de eerste kolom staat
dfR7 = dfR7[titles] # zet de kolommen en data goed in het object
dfR7.sort_values(["Name"], inplace = True)
dfR7.drop(columns=["Address","Operating System","Site","Exploits","Malware","Vulnerabilities","Risk","Last Scan","Assessed"], inplace=True)
dfR7["Name"] = dfR7["Name"].str.split('.').str[0] #strip het domein van het FQDN
dfR7["Name"] = dfR7["Name"].str.lower() # alles lowercase
#Pandas module om ook van het AD-betand 1 kolom "name" over te houden in het object dfAD zodat ik later kan vergelijken.
dfAD = pd.read_csv("computerlist.csv")
dfAD.drop(columns=["DNSHostName","OperatingSystem","IPAddress", "LastLogon"], inplace= True)
dfAD["Computer"] = dfAD["Computer"].str.lower()
#beide objecten opslaan in een csv-bestand deze te vergelijken
dfR7.to_csv("fr7.csv", index=False)
dfAD.to_csv("fAD.csv", index=False)
with open('fr7.csv', 'r') as t1, open('fAD.csv', 'r') as t2:
fileRapid = t1.readlines()
fileAD = t2.readlines()
#de bestanden fr7.csv en fad.csv vergelijken aan de hand van een for loop
# deze dan opslaan in update.csv
with open('update.csv', 'w') as outFile:
for line in fileAD:
if line not in fileRapid:
outFile.write(line)
#hier haal ik weer het oude bestand van AD erbij om deze zometeen te mergen met het net gemaakte update.csv bestand
# zodat ik alle nuttige kolommen weer heb
dfAD = pd.read_csv("computerlist.csv")
dfAD["Computer"] = dfAD["Computer"].str.lower()
dfAD.to_csv("f1AD.csv", index=False)
# merge functie van de Pandas module
data1 = pd.read_csv('update.csv')
data2 = pd.read_csv("f1AD.csv")
output1 = pd.merge(data1, data2,
on='Computer',
how='inner')
#opslaan naar TotalresultsAD.csv
output1.to_csv("totaldifferenceAD_R7.csv", index =False)
#met de datetime module maak ik een variabele: time met de dag van vandaag minus 30 dagen
time = datetime.today() - timedelta(60)
"lees 2 x het bestand in"
dfgood = pd.read_csv("totaldifferenceAD_R7.csv")
dfbad = pd.read_csv("totaldifferenceAD_R7.csv")
#dit outputbestand geeft de assets weer die een LastLogon hebben recenter dan 30 dagen geleden
dfgood['LastLogon'] = pd.to_datetime(dfgood['LastLogon'], errors = 'coerce') #errors = 'coerce' zorgt ervoor dat de foutieve waarden in de kolom LastLogon worden genegeerd
dfgood.sort_values(["LastLogon"], inplace = True)
dfnew = (dfgood['LastLogon'] >= time)
dfnew = dfgood.loc[dfnew]
#dit outputbestand geeft de assets weer die een LastLogon hebben ouder dan 30 dagen geleden
dfbad['LastLogon'] = pd.to_datetime(dfbad['LastLogon'], errors = 'coerce') #errors = 'coerce' zorgt ervoor dat de foutieve waarden in de kolom LastLogon worden genegeerd
dfbad.sort_values(["LastLogon"], inplace = True)
newdf2 = (dfbad['LastLogon'] < time)
newdf2 = dfbad.loc[newdf2]
#wegschrijven uiteindelijke bestanden
dfnew.to_csv("newer_than_60_days.csv",index =False)
newdf2.to_csv("older_than_60_days.csv",index =False)
#opschonen van de bestanden
os.remove("FAD.csv")
os.remove("fr7.csv")
os.remove("computerlist.csv")
os.remove("downloaded.csv")
os.remove("f1AD.csv")
os.remove("update.csv")
if __name__=="__main__":
Main() ```
Thanks in advance for any help
Because I don't have a high enough SO reputation, unfortunately I can't simply comment this and need to make it an 'Answer'.
Changing
r= requests.get(url,verify=False, auth=HTTPBasicAuth('REMOVED', 'REMOVED))
to
r= requests.get(url,verify=False, auth=HTTPBasicAuth('REMOVED', 'REMOVED'))
will get the syntax highlighting all fixed up and may make it easier for someone smarter than me to assist you :)
Something that I've previously come across (primarily with web scraping packages) were functions that didn't play nicely with relative paths - perhaps changing them to absolute paths using os.path.abspath(".....") may help? It's a stab in the dark so that this 'Answer' actually has a potentially useful element to it, but may be an adjustment worth exploring.
I have a small big problem with an app I have created, using Python and Streamlit.
The problem is that when I want to deploy it to the Heroku services, it builds, but when I try to access it, it tells me that there is an applicaiton error.
I have looked at my code, whcih on the local machine works pefectly.
Do you have any idea?
Code:
# Loading packages ##########################################################################
import streamlit as st
import numpy as np
import networkx as nx
from pyvis.network import Network
# import plotly.express as px
###############################################################################################
st.set_page_config(layout="wide")
##### Creazione Sidebar ####################################################################
st.sidebar.title('Ottimizzazione')
st.sidebar.header('Creazione matrice')
st.sidebar.write('Scrivere la matrice quadrata come da esempio Matrice: 1,2;4,5')
st.sidebar.write('Usare la , per separe le varie colonne e il ; per andare alla prossima riga')
input_matrice=st.sidebar.text_area('Scrivere Matrice:')
# Creazione matrice
matrice = np.matrix(input_matrice)
matrice_array = np.asarray(matrice)
##### Creazione Due colonne per output ##############################################################
left_column1, right_column1 = st.beta_columns(2)
left_column2, right_column2 = st.beta_columns(2)
## Visualizzazione matrice
with left_column1:
st.header('Matrice Quadrata Creata:')
matrice_array
with right_column1:
st.header('Visualizzazione del grafo:')
st.write('Il grafo verrà visualizzato in una apgina separata, in quanto al momento non riuslta possible integrarlo nella pagina pricipale')
##### Creaizone Input per calcolo #######################################################
st.sidebar.header('Calcolo percorso:')
st.sidebar.write('Calcolo del percorso più corto, Nx nodo di partenza e Ny nodo di arrivo. I nodi della matrice corrispondono agli indici della colonna.')
selezione = st.sidebar.radio("Seleziona tipo di calcolo",('Da Nx a tutti più vicini','Da Nx a Ny'))
if selezione == 'Da Nx a tutti più vicini':
nodo_partenza=np.int(st.sidebar.number_input('Scrivere nodo di partenza (Numero intero):'))
bottone_calcolo = st.sidebar.button('Calcola percorso', key=1)
if bottone_calcolo:
grafo_matrice = nx.from_numpy_matrix(matrice_array)
percorso = nx.single_source_dijkstra_path(grafo_matrice, nodo_partenza, weight='weight')
lunghezza = nx.single_source_dijkstra_path_length(grafo_matrice, nodo_partenza, weight='weight')
with left_column2:
st.header('Percorsi:')
st.write('Qui vengono mostrati i vari percorsi che sono stati trovati. I valori a destra corrispondono al ordine di successione, mentre I valori a destra i vari nodi.')
percorso
with right_column2:
st.header('Tempi percorsi:')
st.write('Qui vengono mostrati i vari tempi dei vari percorsi')
lunghezza
with right_column1:
nt=Network("1000px","1000px")
nt.from_nx(grafo_matrice)
nt.show("nx.html")
elif selezione == 'Da Nx a Ny':
nodo_partenza=np.int(st.sidebar.number_input('Scrivere nodo di partenza (Numero intero):'))
nodo_arrivo=np.int(st.sidebar.number_input('Scrivere nodo di arrivo (Numero intero):'))
bottone_calcolo = st.sidebar.button('Calcola percorso', key=2)
if bottone_calcolo:
grafo_matrice = nx.from_numpy_matrix(matrice_array)
percorso = nx.shortest_path(grafo_matrice, source=nodo_partenza, target=nodo_arrivo, weight='weight')
lunghezza = nx.shortest_path_length(grafo_matrice, source=nodo_partenza, target=nodo_arrivo, weight='weight')
with left_column2:
st.header('Percorso:')
st.write('Qui viene mostrato il percorso trovato. I valori a destra corrispondono al ordine di successione, mentre I valori a destra i vari nodi.')
percorso
with right_column2:
st.header('Tempo percorso:')
st.write('Qui viene mostrato il tempo totale del pecrorso più breve')
lunghezza
with right_column1:
nt=Network("1000px","1000px")
nt.from_nx(grafo_matrice)
nt.show("nx.html")
Requirements:
streamlit==0.75.0
numpy==1.19.2
networkx==2.5.0
pyvis==0.1.9
Thank you for the help!
I solved the issue, it appear the problem was not in the code, but inside the command of the code for running the webapp
I'm having trouble displaying some data from Globcolour (1), due to the projection used with the matplotlib and cartopy definition of the image.
I downloaded a Total Suspended Matter image in NetCDF format (here is the data enter link description here), and when I tried to display it, along with a coastline from the cartopy package, there is a notorious gap between the coastline and the data. As you can see below, the pixels should be next to the coastline (black line), and not surpassed into the land (yellow pixels in the flags image)
This shouldn't happen. I check using QGIS and loading directly the netcdf file the coastline is set correctly.
Initially I used a PlateeCarrer projection for the image, considering that if the image was in WGS84 they would match, but clearly they don't. I've tried using the transform option in the matplotlib function but I haven't made it work. Either the gap remains, or the coordinates of the figure change to projected ones and my data (which is in geographical coordinates) disappear.
The attributes of the NetCDF file are:
'grid_type': 'Equirectangular',
'spatial_resolution': 4.6383123,
'nb_equ_bins': 55,
'registration': 5,
'lat_step': 0.041666668,
'lon_step': 0.041666668,
'earth_radius': 6378.137,
'max_north_grid': 11.124998,
'max_south_grid': 9.27,
'max_west_grid': -86.25,
'max_east_grid': -83.97,
'northernmost_latitude': 11.124998,
'southernmost_latitude': 9.249998,
'westernmost_longitude': -86.25,
'easternmost_longitude': -84.0,
'nb_grid_bins': 2475,
'nb_bins': 2475,
'pct_bins': 100.0,
'nb_valid_bins': 1089,
'pct_valid_bins': 44.0,
'netcdf_version': '4.3.3.1 of Jul 8 2016 18:15:50 $',
'DPM_reference': 'GC-UD-ACRI-PUG',
'IODD_reference': 'GC-UD-ACRI-PUG'}
The code that I'm using to plot the image is:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import cartopy.crs as ccrs
import dill as pickel
def paint_maps(df_std=None, fecha=1, attributes=None,
savefol='/media/felipe/TOSHIBA EXT/iMARES/Investigacion/2019_MariculturaPacifico/DB/figures/',
disp_fig=0):
"""Función para dibujar los datos contenidos en los archivos netCDF de SST, Salinidad y propiedad ópticas del agua.
Recibe el dataframe con la información en formato de Pandas Dataframe, y selecciona según una fecha establecida,
el conjunto de datos con coordenadas Lat-Lon que debe dibujar. Esos los dibuja y transforma a formato raster. Unido
se dibuja también la línea de costa proveniente de un archivo shapefile. La función dibuja toda la información
contenida en el dataframe aportado (datos, anomalías, flags, y cualquier otro dato que tenga.
Recibe:
df_std: dataframe con la información a dibujar. Debe venir indexado por fecha, lat y lon.
fecha: día que se elige dibujar. Formato string 'yyyymmdd'. Valor 1 significa que grafica el valor promedio de todas las fechas en cada
píxel. Promedio simple ignorando NaN's
attributes: diccionario con los atributos del netcdf de donde se obtiene nombre de variable y unidades. Creado
con open_netcdf.py
savefol: carpeta donde se guardan las imágenes dibujadas
disp_fig: booleano para imprimir figura en pantalla.
Devuelve:
Nada. Solo crea y guarda figuras"""
# Identifica la fecha solicitada (cuando se ha especificado) y confirma que sea parte del registro. Extrae la
# información del Dataframe en la fecha que se solicitó, o calcula el promedio de todas las fechas para graficar
# el valor promedio.
if fecha != 1:
if isinstance(fecha, str):
fecha = pd.to_datetime(fecha + '120000')
else:
print('La fecha indicada no está en formato String. Reinicie la ejecución.')
try:
idx = pd.IndexSlice
df_map = df_std.loc[idx[:, :, fecha], :]
except:
print('Se generó un error. Posiblemente fecha no está dentro del registro. La fecha debe estar entre el ' + df_std.index[0][-1].strftime('%d/%m/%Y') + ' y el ' + df_std.index[-1][-1].strftime('%d/%m/%Y'))
raise
else:
df_map = df_std.groupby(['lat', 'lon']).mean()
# Reestructura la información para tenerla en forma de matriz y dibujarla de forma más simple. Extrae los valores y
# las latitudes y longitudes correspondientes, así como los valores de la variable y sus flags.
df_map2 = df_map.unstack(level=0)
vari = df_map2['mean_val'].values
flags = df_map2['flag_val'].values
lat = df_map2['mean_val'].columns.get_level_values('lat')
lon = df_map2['mean_val'].index.get_level_values('lon')
# Extrae de los atributos del netcdf el nombre de la variable a graficar y las unidades
variable_str = attributes['variable']['long_name']
variable_units = attributes['variable']['units']
# Dibuja el mapa que se haya seleccionado según fecha (valor promedio del valor o fecha específica)
fig, ax = plt.subplots(1, 2, figsize=(10, 10), subplot_kw={'projection': ccrs.PlateCarree()})
extend = [lon[1], lon[-1], lat[1], lat[-1]]
# Primera figura. Variable a graficar. Usa línea de costa del cartopy y coloca una leyenda abajo
ax[0].set_extent(extend)
ax[0].coastlines(resolution='10m')
#cs = ax[0].pcolormesh(lon, lat, vari.T)
cs = ax[0].pcolormesh(lon, lat, vari.T, transform=ccrs.PlateCarree())
ax[0].set_title(variable_str)
cax, kw = matplotlib.colorbar.make_axes(ax[0], location='bottom', pad=0.05, shrink=0.7)
out = fig.colorbar(cs, cax=cax, extend='both', **kw)
out.set_label('Units: '+variable_units, size=10)
# Segunda figura. Flags de la figura. Usa la leyenda directamente de los datos usados.
ax[1].set_extent(extend)
ax[1].coastlines(resolution='10m')
cs2 = ax[1].pcolormesh(lon, lat, flags.T)
ax[1].set_title('Flags')
cax, kw = matplotlib.colorbar.make_axes(ax[1], location='bottom', pad=0.05, shrink=0.7)
out = fig.colorbar(cs2, cax=cax, extend='both', **kw)
out.set_label('Flags', size=10)
# Salva la figura
plt.savefig(savefol+variable_str+'.jpg', bbox_inches='tight')
with open(savefol+'fig_'+variable_str+'.pickel', 'wb') as f:
pickel.dump(fig, f)
# Imprime figura si se elige opción con disp_fig
if disp_fig == 1:
plt.show()
return
It receives the data as a Pandas dataframe. The NetCDF was opened using xarray.open_dataset and then transforming it to Pandas with to_dataframe()
I'm using Python 3.7 in Ubuntu.
Last thing. When loading the cartopy.crs package, this error occurs:
ERROR 1: PROJ: proj_create_from_database: Open of /home/felipe/anaconda3/envs/personal/share/proj failed
Could it be affecting?
we answered to Felipe by email, I copy/paste here:
A small Python script to create a map on your area from a TSM GlobColour Product (I used a monthly product to have a good coverage):
import netCDF4 as nc
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
fig, ax = plt.subplots(figsize=(5, 5), subplot_kw=dict(projection=ccrs.PlateCarree()))
# my region of interest
ax.set_extent([-86, -84, 9, 11])
ax.coastlines(resolution='10m', color='red')
nc_dst = nc.Dataset('L3m_20100101-20100131__GLOB_4_AV-MER_TSM_MO_00.nc')
# extent of the product
data_extent = [nc_dst.max_west_grid, nc_dst.max_east_grid,
nc_dst.max_south_grid, nc_dst.max_north_grid]
data = nc_dst.variables['TSM_mean'][:]
flags = nc_dst.variables['TSM_flags'][:]
land = flags & 8 # LAND == 3rd bit == 2^3 == 8
data_noland = np.ma.masked_where(land, data)
ax.imshow(data_noland, origin='upper', extent=data_extent)
plt.savefig('TSM_noland.png')
ax.imshow(data, origin='upper', extent=data_extent)
plt.savefig('TSM.png')
I think you are facing to 2 problems:
1) Our products may overlap some land areas because of the Level-3 rebinning during the GlobColour processing: if a 4km pixel has only the corner on the water we will fill the full pixel. We keep them because they may be usefull for some needs (for example areas where the land/water limit is varying), but in the quality flags we provide a LAND mask which could be used to remove these pixels. You can also use your own LAND mask if you prefer. The Python example below shows how to use the LAND mask.
2) I suspect that your Python code introduces an east/south shift of at least half a pixel maybe because the lat/lon arrays are for the center of each pixel but the extent needed by cartopy is the exterior limit.
GlobColour flags are defined in the Product User Guide http://www.globcolour.info/CDR_Docs/GlobCOLOUR_PUG.pdf page 76.
The GlobColour Team
Are you sure your data are in WGS84? Looking at the metadata, I only see:
'earth_radius': 6378.137
which I imply means assume a spherical Earth with radius 6378.137 km. I don't have access to your data, but I would try setting up a cartopy.crs.Globe instance with that radius.
The documentation is somewhat vague about this whereas I would've thought it'd be a pretty straight-forward thing to implement.
The k_mean algorithm applied to the MNIST digit dataset outputs 10 regions with a certain number associated with it, though it isn't the number represented by most of the digits contained within that region.
I do have my ground_truth label table.
How do I make it so that each region generated by the k_mean algorithm ends up being labeled as the digit which has the highest probability of being covered?
I've spent hours yesterday making up this code to do that, but it's still incomplete:
# TODO: for centroid-average method, see https://stackoverflow.com/a/25831425/9768291
def most_probable_digit(indices, data):
"""
Avec un tableau d'indices (d'un label spécifique assigné par scikit, obtenu avec "get_indices_of_label")
où se situent les vrais labels dans 'data', cette fonction calcule combien de fois chaque vrai label
apparaît et retourne celui qui est apparu le plus souvent (et donc qui a la plus grande probabilité
d'être le ground_truth_label désigné par la région délimitée par scikit).
:param indices: tableau des indices dans 'data' qui font parti d'une région du k_mean
:param data: toutes les données réparties dans les régions du k_mean
:return: la valeur (le digit) le plus probable associé à cette région
"""
actual_labels = []
for i in indices:
actual_labels.append(data[i])
if verbose: print("The actual labels for each of those digits are:", actual_labels)
counts = count_labels("actual labels", actual_labels)
probable = counts.index(max(counts))
if verbose: print("Most probable digit:", probable)
return probable
def get_list_of_indices(data, label):
"""
Retourne une liste d'indices correspondant à tous les endroits
où on peut trouver dans 'data' le 'label' spécifié
:param data:
:param label: le numéro associé à une région générée par k_mean
:return:
"""
return (np.where(data == label))[0].tolist()
# TODO: reassign in case of doubles
def obtain_corresponding_labels(data, real_labels):
"""
Assign the most probable label to each region.
:param data: list of regions associated with x_train or x_test (the order is preserved!)
:param real_labels: actual labels to assign to the region numbers
:return: the list of corresponding actual labels to region numbers
"""
switches_to_make = []
for i in range(10):
list_of_indices = get_list_of_indices(data, i) # indices in 'data' which are associated with region "i"
probable_label = most_probable_digit(list_of_indices, real_labels)
print("The assigned region", i, "should be considered as representing the digit ", probable_label)
switches_to_make.append(probable_label)
return switches_to_make
def rearrange_labels(switches_to_make, to_change):
"""
Takes region numbers and assigns the most probable digit (label) to it.
For example, if switches_to_make[3] = 5, it means that the 4th region (index 3 of the list)
should be considered as representing the digit "5".
:param switches_to_make: list of changes to make
:param to_change: this table will be changed according to 'switches_to_make'
:return: nothing, the change is made in-situ
"""
for region in range(len(to_change)):
for label in range(len(switches_to_make)):
if to_change[region] == label: # if it corresponds to the "wrong" label given by scikit
to_change[region] = switches_to_make[label] # assign the "most probable" label
break
def count_error_rate(found, truth):
wrong = 0
for i in range(len(found)):
if found[i] != truth[i]:
wrong += 1
print("Error rate = ", wrong / len(found) * 100, "%\n\n")
def treat_data(switches_to_make, predictions, truth):
rearrange_labels(switches_to_make, predictions) # Rearranging the training labels
count_error_rate(predictions, truth) # Counting error rate
For now, the problem with my code is that it can generate duplicates (if two regions have the same highest probability digit, that digit is associated with both regions).
Here is how I use the code:
kmeans = KMeans(n_clusters=10) # TODO: eventually use "init=ndarray" to be able to use custom centroids for init ?
kmeans.fit(x_train)
training_labels = kmeans.labels_
print("Done with calculating the k-mean.\n")
switches_to_make = utils.obtain_corresponding_labels(training_labels, y_train) # Obtaining the most probable labels
utils.treat_data(switches_to_make, training_labels, y_train)
print("Assigned labels: ", training_labels)
print("Real labels: ", y_train)
print("\n####################################################\nMoving on to predictions")
predictions = kmeans.predict(x_test)
utils.treat_data(switches_to_make, predictions, y_test)
I obtain approximately a 50% error rate with my code.
If I understand you correctly, you want to assign the actual digit value as a cluster label that matches that cluster, correct? If that is the case, I don't think it is possible.
K-Means is an unsupervised learning algorithm. It does not understand what it is looking at and the labels it assigns are arbitrary. Instead of 0, 1, 2, ... it could have called them 'apple', 'orange', 'grape' ... . All K-Means can ever do, is to tell you that a bunch of data points are similar to each other based on some metric, that is all. It is great for data exploration or pattern finding. But not for telling you "What" it actually is.
It does not matter what post processing you do, because the computer can never, programmatically, know what the true labels are, unless you, the human, tell it. In which case you might as well use a supervised learning algorithm.
If you want to train a model, that, when it see's a number, it can then assign the correct label to it, you must use a supervised learning method (where labels are a thing). Look into Random Forest instead, for instance. Here is a similar endeavor.
Here is the code to use my solution:
from sklearn.cluster import KMeans
import utils
# Extraction du dataset
x_train, y_train = utils.get_train_data()
x_test, y_test = utils.get_test_data()
kmeans = KMeans(n_clusters=10)
kmeans.fit(x_train)
training_labels = kmeans.labels_
switches_to_make = utils.find_closest_digit_to_centroids(kmeans, x_train, y_train) # Obtaining the most probable labels (digits) for each region
utils.treat_data(switches_to_make, training_labels, y_train)
predictions = kmeans.predict(x_test)
utils.treat_data(switches_to_make, predictions, y_test)
And utils.py:
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances_argmin_min
use_reduced = True # Flag variable to use the reduced datasets (generated by 'pre_process.py')
verbose = False # Should debugging prints be shown
def get_data(reduced_path, path):
"""
Pour obtenir le dataset désiré.
:param reduced_path: path vers la version réduite (générée par 'pre_process.py')
:param path: path vers la version complète
:return: numpy arrays (data, labels)
"""
if use_reduced:
data = open(reduced_path)
else:
data = open(path)
csv_file = csv.reader(data)
data_points = []
for row in csv_file:
data_points.append(row)
data_points.pop(0) # On enlève la première ligne, soit les "headers" de nos colonnes
data.close()
# Pour passer de String à int
for i in range(len(data_points)): # for each image
for j in range(len(data_points[0])): # for each pixel
data_points[i][j] = int(data_points[i][j])
# # Pour obtenir des valeurs en FLOAT normalisées entre 0 et 1:
# data_points[i][j] = np.divide(float(data_points[i][j]), 255)
# Pour séparer les labels du data
y_train = [] # labels
for row in data_points:
y_train.append(row[0]) # first column is the label
x_train = [] # data
for row in data_points:
x_train.append(row[1:785]) # other columns are the pixels
x_train = np.array(x_train)
y_train = np.array(y_train)
print("Done with loading the dataset.")
return x_train, y_train
def get_test_data():
"""
Retourne le dataset de test désiré.
:return: numpy arrays (data, labels)
"""
return get_data('../data/reduced_mnist_test.csv', '../data/mnist_test.csv')
def get_train_data():
"""
Retourne le dataset de training désiré.
:return: numpy arrays (data, labels)
"""
return get_data('../data/reduced_mnist_train.csv', '../data/mnist_train.csv')
def display_data(x_train, y_train):
"""
Affiche le digit voulu.
:param x_train: le data (784D)
:param y_train: le label associé
:return:
"""
# Exemple pour afficher: conversion de notre vecteur d'une dimension en 2 dimensions
matrix = np.reshape(x_train, (28, 28))
plt.imshow(matrix, cmap='gray')
plt.title("Voici un " + str(y_train))
plt.show()
def generate_mean_images(x_train, y_train):
"""
Retourne le tableau des images moyennes pour chaque classe
:param x_train:
:param y_train:
:return:
"""
counts = np.zeros(10).astype(int)
for label in y_train:
counts[label] += 1
sum_pixel_values = np.zeros((10, 784)).astype(int)
for img in range(len(y_train)):
for pixel in range(len(x_train[0])):
sum_pixel_values[y_train[img]][pixel] += x_train[img][pixel]
pixel_probability = np.zeros((len(counts), len(x_train[0]))) # (10, 784)
for classe in range(len(counts)):
for pixel in range(len(x_train[0])):
pixel_probability[classe][pixel] = np.divide(sum_pixel_values[classe][pixel] + 1, counts[classe] + 2)
mean_images = []
if verbose:
plt.figure(figsize=(20, 4)) # values of the size of the plot: (x,y) in INCHES
plt.suptitle("Such wow, much impress !")
for classe in range(len(counts)):
class_mean = np.reshape(pixel_probability[classe], (28, 28))
mean_images.append(class_mean)
# Aesthetics
plt.subplot(1, 10, classe + 1)
plt.title(str(classe))
plt.imshow(class_mean, cmap='gray')
plt.xticks([])
plt.yticks([])
plt.show()
return mean_images
#########
# used for "k_mean" (for now)
def count_labels(name, data):
"""
S'occupe de compter le nombre de data associé à chacun des labels.
:param name: nom de ce que l'on compte
:param data: doit être 1D
:return: counts = le nombre pour chaque label
"""
header = "-- " + str(name) + " -- " # making sure it's a String
counts = [0]*10 # initializing the counting array
for label in data:
counts[label] += 1
if verbose: print(header, "Amounts for each label:", counts)
return counts
def get_list_of_indices(data, label):
"""
Retourne une liste d'indices correspondant à tous les endroits
où on peut trouver dans 'data' le 'label' spécifié
:param data:
:param label: le numéro associé à une région générée par k_mean
:return:
"""
return (np.where(data == label))[0].tolist()
def rearrange_labels(switches_to_make, to_change):
"""
Takes region numbers and assigns the most probable digit (label) to it.
For example, if switches_to_make[3] = 5, it means that the 4th region (index 3 of the list)
should be considered as representing the digit "5".
:param switches_to_make: list of changes to make
:param to_change: this table will be changed according to 'switches_to_make'
:return: nothing, the change is made in-situ
"""
for region in range(len(to_change)):
for label in range(len(switches_to_make)):
if to_change[region] == label: # if it corresponds to the "wrong" label given by scikit
to_change[region] = switches_to_make[label] # assign the "most probable" label
break
def count_error_rate(found, truth):
wrong = 0
for i in range(len(found)):
if found[i] != truth[i]:
wrong += 1
percent = wrong / len(found) * 100
print("Error rate = ", percent, "%")
return percent
def treat_data(switches_to_make, predictions, truth):
rearrange_labels(switches_to_make, predictions) # Rearranging the training labels
count_error_rate(predictions, truth) # Counting error rate
# TODO: reassign in case of doubles
# adapted from https://stackoverflow.com/a/45275056/9768291
def find_closest_digit_to_centroids(kmean, data, labels):
"""
The array 'closest' will contain the index of the point in 'data' that is closest to each centroid.
Let's say the 'closest' gave output as array([0,8,5]) for the three clusters. So data[0] is the
closest point in 'data' to centroid 0, and data[8] is the closest to centroid 1 and so on.
If the returned list is [9,4,2,1,3] it would mean that the region #0 (index 0) represents the digit 9 the best.
:param kmean: the variable where the 'fit' data has been stored
:param data: the actual data that was used with 'fit' (x_train)
:param labels: the true labels associated with 'data' (y_train)
:return: list where each region is at its index and the value at that index is the digit it represents
"""
closest, _ = pairwise_distances_argmin_min(kmean.cluster_centers_,
data,
metric="euclidean")
switches_to_make = []
for region in range(len(closest)):
truth = labels[closest[region]]
print("The assigned region", region, "should be considered as representing the digit ", truth)
switches_to_make.append(truth)
print("Digits associated to each region (switches_to_make):", switches_to_make)
return switches_to_make
Essentially, here is the function that solved my problems:
# adapted from https://stackoverflow.com/a/45275056/9768291
def find_closest_digit_to_centroids(kmean, data, labels):
"""
The array 'closest' will contain the index of the point in 'data' that is closest to each centroid.
Let's say the 'closest' gave output as array([0,8,5]) for the three clusters. So data[0] is the
closest point in 'data' to centroid 0, and data[8] is the closest to centroid 1 and so on.
If the returned list is [9,4,2,1,3] it would mean that the region #0 (index 0) represents the digit 9 the best.
:param kmean: the variable where the 'fit' data has been stored
:param data: the actual data that was used with 'fit' (x_train)
:param labels: the true labels associated with 'data' (y_train)
:return: list where each region is at its index and the value at that index is the digit it represents
"""
closest, _ = pairwise_distances_argmin_min(kmean.cluster_centers_,
data,
metric="euclidean")
switches_to_make = []
for region in range(len(closest)):
truth = labels[closest[region]]
print("The assigned region", region, "should be considered as representing the digit ", truth)
switches_to_make.append(truth)
print("Digits associated to each region (switches_to_make):", switches_to_make)
return switches_to_make
I have a Database Class built in Python 3.5.2, which I am built to have general functions other classes could use to connect to the DB.
I am using SQLite and SQLite Studio to check my work.
So far, I have successfully created create-table functions, as well as others that return the name of all tables in the database (in a list), and one that returns the names of all columns in any table of the DB.
The problem:
For some reason, the INSERT INTO method is crashing my DB (it takes a while, then it says it's locked). It creates a db-journal file in the same folder, I assume it's some kind of log.
I have substituted the execute lines for prints, to check that the query is ok. I have got the query it prints and put it into the query editor of sqlite Studio to see if they work, AND THEY DO!
The method uses a while loop to create a query for each insert I want to do, and I am quite sure there is something wrong with my method definition. It must be doing something that crashes de DB (maybe something memory related?)
Here is the code:
import sqlite3
class Database:
def __init__(self, name):
self.name=name
self.db_conn=sqlite3.connect(self.name+'.db')
self.cursor=self.db_conn.cursor()
self.commit=self.db_conn.commit()
def get_name(self):
return self.name
def get_tableNames(self):
#get table names into a list:
self.tables=[]
c=self.db_conn.execute("select name from sqlite_master where type = 'table'")
for row in c:
self.tables.append(row[0])
return self.tables
def get_tableColumns(self,tableName):
#get table columns into a list:
self.tableName=tableName
self.columns=[]
c=self.db_conn.execute("PRAGMA table_info("+ self.tableName+" );")
for row in c:
self.columns.append(row[1])
return self.columns
def create_table(self,table_name,*args):
#crear una tabla con nombre table_name y columns= cada uno de los argumentos de *fields.
#cada campo de args debe ser una lista.
self.table_name=table_name
fields=[]
for field in args:
fields.append(field)
i=0
try:
self.db_conn.execute("CREATE TABLE " +self.table_name+ "(ID INTEGER PRIMARY KEY AUTOINCREMENT);")
self.commit
except sqlite3.OperationalError:
print("La tabla "+self.table_name+" no se ha creado")
#añadimos las columnas a la tabla creada:
while i<len(fields):
if len(fields[i])==2:
try:
self.db_conn.execute("ALTER TABLE "+self.table_name+" ADD COLUMN "+fields[i][0]+" "+fields[i][1].upper()+";")
self.commit
i+=1
except sqlite3.OperationalError:
print("No se ha podido añadir la columna "+fields[i][0]+" a la tabla "+self.table_name)
i+=1
elif len(fields[i])==3:
try:
self.db_conn.execute("ALTER TABLE " +table_name+" ADD COLUMN " +fields[i][0]+" "+fields[i][1].upper()+" ("+str(fields[i][2])+") "+";")
self.commit
i+=1
except sqlite3.OperationalError:
print("No se ha podido añadir la columna "+fields[i][0]+" a la tabla "+self.table_name)
i+=1
else:
print('los argumentos deben ser listas donde el primer elemento será el nombre de la columna y los otros dos, el tipo y tamaño (de haberlo)')
def insert(self,tableName, *vals):
#insertar valores en tabla:
fields=self.get_tableColumns(tableName)
rows=[]
for row in vals:
rows.append(row)
r=0
while r<len(rows):
i=1
query="INSERT INTO "+self.tableName+" ("
while i<len(fields):
query=query+str(fields[i]+',')
i+=1
query=query[:-1]
query=query+') VALUES ('
i=0
while i<len(rows[r]):
if type(rows[r][i]) is str:
query=query+str("'"+rows[r][i]+"'"+',')
else:
query=query+str(rows[r][i])+','
i+=1
query=query[:-1]+');'
print(query)
print('commit')
try:
self.db_conn.execute(query)
self.commit
print("se han añadido los datos a la tabla")
except sqlite3.OperationalError:
print('no se han podido añadir los valores específicos a la tabla '+self.tableName)
r+=1
#esta función peta la DB y por eso no funcionan las queries.
def close(self):
self.db_conn.close()
closed= 'database closed'
return closed
In order to make it work:
db=Database('stock-data')
db.create_table('test',['name','text',50],['age','integer'])
db.insert('test',['john',20],['will',21])
This last expression is the one that crashes.