Pandas dataframe from numpy with last dimension as object - python-3.x

Convert 3d numpy array to pandas dataframe with numpy.array as elements.
Are there any other solutions? What about speed?
import numpy as np
import pandas as pd
ones = np.ones((2,3,5))
temp = [[np.array(column_elem, dtype=np.object) for column_elem in row] for row in ones]
df = pd.DataFrame(temp)

Related

Can't run KMeans algorithm because Pandas DataFrame is loading decimal point numbers as strings

I drop all columns but the two I am interested in. When I try to convert my dataframe to a 2d numpy array from the two columns it turns into an object type that contains strings. I believe this is because the Data_Values has values such as "23.6." Is there anyway I can get rid of the decimal point and trailing numbers in this data as they are all different values.
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin
data = pd.read_csv('Alzheimer_s_Disease_and_Healthy_Aging_Data.csv', engine='python', header=None)
data.columns = ['RowId', 'YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'Datasource', 'Class', 'Topic', 'Question', 'Response',
'Data_Value_Unit', 'DataValueTypeID', 'Data_Value_Type', 'Data_Value', 'Data_Value_Alt', 'Data_Value_Footnote_Symbol',
'Data_Value_Footnote', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size', 'StratificationCategory1',
'Stratification1', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3', 'Stratification3', 'Geolocation',
'ClassID', 'TopicID', 'QuestionID', 'ResponseID', 'LocationID', 'StratificationCategoryID1', 'StratificationID1',
'StratificationCategoryID2', 'StratificationID2', 'StratificationCategoryID3', 'StratificationID3', 'Report']
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 10)
data1 = data.iloc[1:]
df = data1[data1["Data_Value_Type"].str.contains("Mean") == False]
df = data1[data1["Data_Value"].str.contains("NaN") == False]
df.dropna()
df = df.drop(columns=['RowId', 'YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'Datasource', 'Class', 'Topic', 'Question', 'Response',
'Data_Value_Unit', 'DataValueTypeID', 'Data_Value_Type', 'Data_Value_Alt', 'Data_Value_Footnote_Symbol',
'Data_Value_Footnote', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size', 'StratificationCategory1',
'Stratification1', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3', 'Stratification3', 'Geolocation',
'ClassID', 'TopicID', 'QuestionID', 'ResponseID', 'StratificationCategoryID1', 'StratificationID1',
'StratificationCategoryID2', 'StratificationID2', 'StratificationCategoryID3', 'StratificationID3', 'Report'])
x = df.to_numpy()
print(x.dtype)

How to change the datatype of the output

I want the output of this code in int64 format but the output of this code is in float. how can change it? pls suggest
import pandas as pd
import numpy as np
df = pd.read_csv('https://query.data.world/s/HqjNNadqEnwSq1qnoV_JqyRJkc7o6O')
df = df[df.isnull().sum(axis=1) < 5]
print(round(100*(df.isnull().sum()/len(df.index))),2)
Something like this should do the trick...
import pandas as pd
import numpy as np
df = pd.read_csv('https://query.data.world/s/HqjNNadqEnwSq1qnoV_JqyRJkc7o6O')
df = df[df.isnull().sum(axis=1) < 5]
x = round(100*(df.isnull().sum()/len(df.index)))
y = x.astype(np.int64)
print(y)
The key bit being x.astype(np.int64) to convert the format.

I get a KeyError when trying to plot these group of dataframes with Pandas

I get a keyerror for "Displacement" when I try to plot Force against Displacement with pandas for these group of dataframes. Please help.
The link to the excel sheet being used:
https://www.dropbox.com/s/f8lnp973ojv3ish/neurospheress.xlsx?dl=0
I tried clearing any space in the column titles but it doesn't work
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_excel('neurospheress.xlsx', sep='\s*,\s*', sheet_name = 'LS')
df1 = data.iloc[:80,:2]
df2 = data.iloc[:80,2:4]
df3 = data.iloc[:80,4:]
dfs = [df1,df2,df3]
for i,df in enumerate(dfs):
plt.plot(df['Displacement'], df['Force'], linestyle='--', alpha= 0.8, label='df{}'.format(i))
plt.legend(loc='best')
plt.show()
The below solution works, it basically adds two things to your solution
a) Skip the first row from excel
b) Rename the column names for df2 and df3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_excel('neurospheress.xlsx', sep='\s*,\s*', sheet_name = 'LS',skiprows=1)
df1 = data.iloc[:80,:2]
df2 = data.iloc[:80,2:4]
df3 = data.iloc[:80,4:]
dfs = [df1,df2,df3]
df2.rename(columns={'Force.1':'Force','Displacement.1':'Displacement'},inplace=True)
df3.rename(columns={'Force.2':'Force','Displacement.2':'Displacement'},inplace=True)
print(data.columns)
print(df1.columns)
print(df2.columns)
for i,df in enumerate(dfs):
plt.plot(df['Displacement'], df['Force'], linestyle='--', alpha= 0.8, label='df{}'.format(i))
plt.legend(loc='best')
plt.show()

sklearn MinMaxScaler mis-scaling?

I'm having trouble understanding one of the scaled columns in a pandas dataframe returned by MinMaxScaler:
The code snippet is as follows:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
A = np.random.randint(5, size=(8, 4))
FrameA = pd.DataFrame()
FrameA = A
scaled_array = MinMaxScaler().fit_transform(FrameA)
Scaled (LHS) and original (RHS)
Column 2 is suspect. The formula seems to be: x[i] / max{x} - 1 which differs from the other columns.

How do I map df column values to hex color in one go?

I have a pandas dataframe with two columns. One of the columns values needs to be mapped to colors in hex. Another graphing process takes over from there.
This is what I have tried so far. Part of the toy code is taken from here.
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Create dataframe
df = pd.DataFrame(np.random.randint(0,21,size=(7, 2)), columns=['some_value', 'another_value'])
# Add a nan to handle realworld
df.iloc[-1] = np.nan
# Try to map values to colors in hex
# # Taken from here
norm = matplotlib.colors.Normalize(vmin=0, vmax=21, clip=True)
mapper = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.viridis)
df['some_value_color'] = df['some_value'].apply(lambda x: mapper.to_rgba(x))
df
Which outputs:
How do I convert 'some_value' df column values to hex in one go?
Ideally using the sns.cubehelix_palette(light=1)
I am not opposed to using something other than matplotlib
Thanks in advance.
You may use matplotlib.colors.to_hex() to convert a color to hexadecimal representation.
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
# Create dataframe
df = pd.DataFrame(np.random.randint(0,21,size=(7, 2)), columns=['some_value', 'another_value'])
# Add a nan to handle realworld
df.iloc[-1] = np.nan
# Try to map values to colors in hex
# # Taken from here
norm = matplotlib.colors.Normalize(vmin=0, vmax=21, clip=True)
mapper = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.viridis)
df['some_value_color'] = df['some_value'].apply(lambda x: mcolors.to_hex(mapper.to_rgba(x)))
df
Efficiency
The above method it easy to use, but may not be very efficient. In the folling let's compare some alternatives.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
def create_df(n=10):
# Create dataframe
df = pd.DataFrame(np.random.randint(0,21,size=(n, 2)),
columns=['some_value', 'another_value'])
# Add a nan to handle realworld
df.iloc[-1] = np.nan
return df
The following is the solution from above. It applies the conversion to the dataframe row by row. This quite inefficient.
def apply1(df):
# map values to colors in hex via
# matplotlib to_hex by pandas apply
norm = mcolors.Normalize(vmin=np.nanmin(df['some_value'].values),
vmax=np.nanmax(df['some_value'].values), clip=True)
mapper = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.viridis)
df['some_value_color'] = df['some_value'].apply(lambda x: mcolors.to_hex(mapper.to_rgba(x)))
return df
That's why we might choose to calculate the values into a numpy array first and just assign this array as the newly created column.
def apply2(df):
# map values to colors in hex via
# matplotlib to_hex by assigning numpy array as column
norm = mcolors.Normalize(vmin=np.nanmin(df['some_value'].values),
vmax=np.nanmax(df['some_value'].values), clip=True)
mapper = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.viridis)
a = mapper.to_rgba(df['some_value'])
df['some_value_color'] = np.apply_along_axis(mcolors.to_hex, 1, a)
return df
Finally we may use a look up table (LUT) which is created from the matplotlib colormap, and index the LUT by the normalized data. Because this solution needs to create the LUT first, it is rather ineffienct for dataframes with less entries than the LUT has colors, but will pay off for large dataframes.
def apply3(df):
# map values to colors in hex via
# creating a hex Look up table table and apply the normalized data to it
norm = mcolors.Normalize(vmin=np.nanmin(df['some_value'].values),
vmax=np.nanmax(df['some_value'].values), clip=True)
lut = plt.cm.viridis(np.linspace(0,1,256))
lut = np.apply_along_axis(mcolors.to_hex, 1, lut)
a = (norm(df['some_value'].values)*255).astype(np.int16)
df['some_value_color'] = lut[a]
return df
Compare the timings
Let's take a dataframe with 10000 rows.
df = create_df(10000)
Original solution (apply1)
%timeit apply1(df)
2.66 s per loop
Array solution (apply2)
%timeit apply2(df)
240 ms per loop
LUT solution (apply3)
%timeit apply1(df)
7.64 ms per loop
In this case the LUT solution gives almost a factor 400 of improvement.

Resources