Python function to iterate each unique column and transform using pyspark - python-3.x

I'm building the following global function in PySpark to go through each column in my CSV, where the column names are in different formats, and convert them all to one consistent format with spaces and special characters replaced by an underscore ("_").
I am new to the Python world, and I am getting:
TypeError: Column is not iterable
employeesDF is a DataFrame read from a CSV file on the local system.
I tried the code below:
def colrename(df):
    for col in employeesDF.columns:
        F.col(col).alias(col.replace('/s,#', '_'))
    return employeesDF

ndf = colrename(employeesDF.columns)

This will work:
import re

def colrename(column):
    reg = re.sub(r'\s|#', '_', column)
    return reg

df2 = df2.toDF(*(colrename(c) for c in df2.columns))
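As a quick sanity check, the same substitution can be previewed on plain strings before applying it to the DataFrame (the sample column names here are made up):
import re

def colrename(column):
    return re.sub(r'\s|#', '_', column)

print(colrename('employee id'))   # employee_id
print(colrename('dept#name'))     # dept_name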

In case anyone is interested, I used the code below to do it. I hope this information is useful. Thanks.
from pyspark.sql import *
import re

spark = SparkSession.builder.master("local").appName("test").getOrCreate()

df = spark.read.format('csv')\
    .option('header', True)\
    .option('inferschema', True)\
    .load('C:\\bigdata\\datasets\\employee10000_records.csv')

def colrename(df):
    for names in df.schema.names:
        df = df.withColumnRenamed(names, re.sub(r'([^A-Za-z0-9])', '_', names))
    return df

colrename(df).show()
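For completeness, the select/alias route the question originally attempted also works once the aliased columns are actually selected; a minimal sketch, reusing the question's employeesDF and replacing whitespace and # as in the first answer:
import re
from pyspark.sql import functions as F

def colrename(df):
    # Alias every column in one pass and select the renamed set
    return df.select([F.col(c).alias(re.sub(r'\s|#', '_', c)) for c in df.columns])

ndf = colrename(employeesDF)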

Related

Pivot issue in databricks

I have a dataframe table having values:
id Country Interest
00 Russian Digestion;Destillation
I want to pivot the Interest column and name the new column, in Azure Databricks in Python, like this:
id Country Int Interest
00Q7 Russ Digestion Digestion;Destillation
00Q7 Russ Destillation Digestion;Destillation
Please advise how it can be done.
Regards
RK
I have created a sample dataframe similar to yours using the following code:
data = [['00Q7','Russian Federation','Digestion;Destillation'],['00Q6','United States','Oils;Automobiles']]
df = spark.createDataFrame(data=data,schema = ['id','country','interests'])
display(df)
To get the desired output (like yours), I first split the data in the interests column using pyspark.sql.functions.split.
from pyspark.sql.functions import split,col
df1 = df.withColumn("interest", split(col("interests"), ";"))
display(df1)
Then I exploded the new interest column using pyspark.sql.functions.explode to get the required output.
from pyspark.sql.functions import explode
op = df1.withColumn('interest',explode(col('interest')))
display(op)
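With the sample data above, op should contain one row per split value, along these lines (written out by hand, not re-run):
id    country             interests               interest
00Q7  Russian Federation  Digestion;Destillation  Digestion
00Q7  Russian Federation  Digestion;Destillation  Destillation
00Q6  United States       Oils;Automobiles        Oils
00Q6  United States       Oils;Automobiles        Automobiles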
UPDATE:
data = [['00Q7','Russian Federation','01_Digestion;02_Destillation']]
df = spark.createDataFrame(data=data,schema = ['id','country','interests'])
#display(df)
from pyspark.sql.functions import split,col
df1 = df.withColumn("interest", split(col("interests"), ";"))
from pyspark.sql.functions import explode
op = df1.withColumn('interest',explode(col('interest')))
#UPDATE
from pyspark.sql.functions import concat,lit
op.withColumn("set",concat(lit('Set'),split(col('interest'),'_').getItem(0))).show(truncate=False)
UPDATE-2:
pdf['set']= pdf['interest'].str.split('_').str[0]
import numpy as np
pdf["set"] = np.where(pdf["set"].astype(int)<10 , 'Set'+pdf['set'].str[1], 'Set'+pdf['set'])

When I execute the pandas-profiling package, it doesn't return min, max and mean values

When I profile the following data using pandas-profiling==2.8.0, it doesn't return min, max and mean values.
CSV data
a,b,c
12,2.5,0
12,4.7,5
33,5,4
44,44.21,67
Python code
import json
import pandas as pd
from pandas_profiling import ProfileReport

def profile_report(data):
    dataset = data.select_dtypes(include=['int64', 'float64'])
    profile = ProfileReport(dataset, minimal=True)
    json_data = profile.to_json()
    results = json.loads(json_data)
    print(json.dumps(results, indent=4))

if __name__ == "__main__":
    df = pd.read_csv('data.csv', index_col=None)
    profile_report(df)
In some cases it works properly and returns min, max and mean values, but when I run it on the CSV data above it doesn't return those values.
For a numeric column with fewer distinct values than a given threshold (say 5), pandas-profiling assumes the variable is categorical instead of interval, so the numeric statistics are skipped.
Use the vars.num.low_categorical_threshold parameter to change this (docs).
Example:
profile = ProfileReport(dataset, minimal=True, vars=dict(num={"low_categorical_threshold": 0}))
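Applied to the question's script, that looks like the sketch below; the "variables" key is how I'd expect the report JSON to be structured in this version, so treat that part as an assumption:
import json
from pandas_profiling import ProfileReport

def profile_report(data):
    dataset = data.select_dtypes(include=['int64', 'float64'])
    # Threshold 0: never reclassify low-cardinality numeric columns as categorical
    profile = ProfileReport(dataset, minimal=True,
                            vars=dict(num={"low_categorical_threshold": 0}))
    results = json.loads(profile.to_json())
    # min, max and mean should now appear for each numeric column
    print(json.dumps(results["variables"], indent=4))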

Using langdetect output to populate a new column in my dataframe

Being rather new to programming with Python, I tried to language-detect segments of text in a pandas dataframe.
So first I made a function for the 'langdetect' package:
import pandas as pd
from langdetect import detect

def language_detect(x):
    lang = detect(x)
    print(lang)
My second step would be to feed in the data frame for processing. All the segments that need detecting are in separate rows in the dataframe under the same column header.
result = [language_detect(x) for x in df['column_name']]
df['l_detect'] = pd.append(result)
In the output I see the texts being recognized properly.
But when I try to print result,
it returns only the value 'None' for every entry.
So my questions are:
Why do I get 'None' when the print output from the function has the right values?
How can I attach this to my current dataframe, since when I try to append it I get 'None' on every field as well?
Thanks in advance.
The problem is that result is a list of None values because your function language_detect() doesn't return anything (it only prints the results).
import pandas as pd
from langdetect import detect

lst = [('this is a test', 1), ('what language is this?', 4), ('stackoverflow is a website', 23)]
df = pd.DataFrame(lst, columns=['text', 'something'])

def language_detect(x):
    lang = detect(x)
    print(lang)

result = [language_detect(x) for x in df['text']]
result
#Output: [None, None, None]
Just give it a return value:
def language_detect(x):
    lang = detect(x)
    return lang

df['l_detect'] = df['text'].apply(language_detect)
df.head()
#Output:
# text something l_detect
#0 this is a test 1 en
#1 what language is this? 4 en
#2 stackoverflow is a website 23 en
and it will work as expected.
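One caveat worth adding: detect() raises an exception on empty or otherwise undetectable strings, so on real data a guarded wrapper is safer (a sketch; the exception import path is my assumption):
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def language_detect_safe(x):
    try:
        return detect(x)
    except LangDetectException:
        # Raised for empty strings or text with no detectable features
        return None

df['l_detect'] = df['text'].apply(language_detect_safe)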

How to merge big data of csv files column wise into a single csv file using Pandas?

I have lots of big CSV files, one per country, and I want to merge their columns into a single CSV file. Each file has 'Year' as an index, and all files are the same in terms of length and years covered. Below is a given example of a Japan.csv file.
If anyone can help me please let me know. Thank you!!
Try using:
import pandas as pd
import glob

l = []
path = 'path/to/directory/'
csvs = glob.glob(path + "/*.csv")

for i in csvs:
    df = pd.read_csv(i, index_col=None, header=0)
    l.append(df)

df = pd.concat(l, ignore_index=True)
This should work. It goes over each file name, reads it, and combines everything into one df. You can export this df to CSV or do whatever with it. Good luck.
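Note that pd.concat defaults to stacking rows. For the column-wise merge on a shared 'Year' index that the question describes, a small variation works (a sketch, with the directory path assumed):
import glob
import pandas as pd

# Read each country file with 'Year' as the index
frames = [pd.read_csv(f, index_col='Year') for f in glob.glob('path/to/directory/*.csv')]

# axis=1 joins column-wise, aligning rows on the shared 'Year' index
merged = pd.concat(frames, axis=1)
merged.to_csv('merged.csv')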
import pandas as pd

def combine_csvs_into_one_df(names_of_files):
    one_big_df = pd.DataFrame()
    for file in names_of_files:
        try:
            content = pd.read_csv(file)
        except (FileNotFoundError, PermissionError):
            print(file, "could not be read")
            continue
        one_big_df = pd.concat([one_big_df, content])
        print(file, "added!")
    print("------")
    print("Finished")
    return one_big_df
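A quick usage example (the file names are placeholders):
files = ['Japan.csv', 'Germany.csv', 'France.csv']  # hypothetical file names
big_df = combine_csvs_into_one_df(files)
big_df.to_csv('all_countries.csv', index=False)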

Fuzzy logic for Excel data - Pandas

I have two dataframes: DF (~100k rows), which is a raw data file, and DF1 (15k rows), a mapping file. I'm trying to match the DF.address and DF.Name columns to DF1.Address and DF1.Name. Once a match is found, DF1.ID should be populated in DF.ID (if DF1.ID is not None); otherwise DF1.top_ID should be populated in DF.ID.
I'm able to match the address and name with the help of fuzzy logic, but I'm stuck on how to connect the result obtained to populate the ID.
DF1 - mapping file (screenshot omitted)
DF - raw data file (screenshot omitted)
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from operator import itemgetter

df = pd.read_excel("Test1", index=False)
df1 = pd.read_excel("Test2", index=False)
df = df[df['ID'].isnull()]

zip_code = df['Zip'].tolist()
Facility_city = df['City'].tolist()
Address = df['Address'].tolist()
Name_list = df['Name'].tolist()

def fuzzy_match(x, choice, scorer, cutoff):
    return (process.extractOne(x,
                               choices=choice,
                               scorer=scorer,
                               score_cutoff=cutoff))

for pin, city, Add, Name in zip(zip_code, Facility_city, Address, Name_list):
    #====Address Matching=====#
    choice = df1.loc[(df1['Zip'] == pin) & (df1['City'] == city), 'Address1']
    result = fuzzy_match(Add, choice, fuzz.ratio, 70)
    #====Name Matching========#
    if (result is not None):
        if (result[3] > 70):
            choice_1 = (df1.loc[(df1['Zip'] == pin) & (df1['City'] == city), 'Name'])
            result_1 = (fuzzy_match(Name, choice_1, fuzz.ratio, 95))
            print(ID)
            if (result_1 is not None):
                if (result_1[3] > 95):
                    #Here populating the matching ID
                    print("ok")
                else:
                    continue
            else:
                continue
        else:
            continue
    else:
        continue
IIUC: Here is a solution:
from fuzzywuzzy import fuzz
import pandas as pd
#Read raw data from clipboard
raw = pd.read_clipboard()
#Read map data from clipboard
mp = pd.read_clipboard()
#Merge raw data and mp data as following
dfr = mp.merge(raw, on=['Hospital Name', 'City', 'Pincode'], how='outer')
#dfr will have many duplicate rows - eliminate duplicates
#To eliminate duplicates using token_sort_ratio, compare Address_x and Address_y
dfr['SCORE'] = dfr.apply(lambda x: fuzz.token_sort_ratio(x['Address_x'], x['Address_y']), axis=1)
#Filter only max ratio rows grouped by Address_x
dfr1 = dfr.iloc[dfr.groupby('Address_x').apply(lambda x: x['SCORE'].idxmax())]
#dfr1 shall have the desired result
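To finish the question's actual ask, populating DF.ID from DF1.ID when present and DF1.top_ID otherwise, a minimal sketch on top of dfr1 (the ID and top_ID column names are taken from the question and assumed to survive the merge):
import numpy as np

# Fall back to top_ID wherever the mapping file's ID is missing
dfr1['ID'] = np.where(dfr1['ID'].notna(), dfr1['ID'], dfr1['top_ID'])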
This link has sample data to test the solution provided.
