How to check which row is producing the LangDetectException error in LangDetect? - python-3.x

I have a dataset of tweets that is mainly in English but also contains several tweets in Indian languages (such as Punjabi, Hindi, Tamil, etc.). I want to keep only the English-language tweets and remove the rows with tweets in other languages.
I tried this [https://stackoverflow.com/questions/67786493/pandas-dataframe-filter-out-rows-with-non-english-text] and it worked on the sample dataset. However, when I tried it on my dataset it raised an error:
LangDetectException: No features in text.
I have also already checked another question [https://stackoverflow.com/questions/69804094/drop-non-english-rows-pandasand] whose accepted answer discusses this error and mentions that empty rows might be the reason for it, so I already cleaned my dataset to remove all the empty rows.
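For completeness, the empty-row cleanup was along these lines (a minimal sketch, assuming the tweet text is in a column named text):
import pandas as pd

df = pd.read_csv('Sample.csv')
# drop rows whose text is missing or contains only whitespace
df = df.dropna(subset=['text'])
df = df[df['text'].str.strip().ne('')]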
The simple code below worked on the sample data but not on the original data:
from langdetect import detect
import pandas as pd
df = pd.read_csv('Sample.csv')
df_new = df[df.text.apply(detect).eq('en')]
print('New df is: ', df_new)
How can I check which row is producing the error?
Thanks in advance!

Use a custom function that returns True when detect fails:
df = pd.read_csv('Sample.csv')

def f(x):
    try:
        detect(x)
        return False
    except:
        return True

s = df.loc[df.text.apply(f), 'text']
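s then holds exactly the rows whose text makes detect raise the exception, so you can inspect them directly:
print(s.index.tolist())   # row labels of the failing tweets
print(s.head())           # preview of the problematic text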
Another idea is to create a new column filled by detect, returning NaN when it fails; then filter the rows with missing values into df1, and build df_new from the rows whose detected language is 'en':
import numpy as np

df = pd.read_csv('Sample.csv')

def f1(x):
    try:
        return detect(x)
    except:
        return np.nan

df['new'] = df.text.apply(f1)
df1 = df[df.new.isna()]
df_new = df[df.new.eq('en')]
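df1 shows the rows where detect failed and df_new keeps only the English tweets; if the helper column is not needed afterwards, it can be dropped:
df_new = df_new.drop(columns='new')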

Related

How to clean CSV file for a coordinate system using pandas?

I wanted to create a program to convert CSV files to DXF (AutoCAD), but the CSV file sometimes comes with a header and sometimes without one, and there are cells that cannot be empty, such as the coordinates. I also noticed that after excluding some of the inputs the value is nan or NaN and it was necessary to get rid of them, so I offer my answer below and ask you to share your opinions on a better implementation.
Solution:
import string
import pandas

def pandas_clean_csv(csv_file):
    """
    Function pandas_clean_csv Documentation
    - I got help from this site; it may help you as well:
      Get the row with the largest number of missing data, for more documentation:
      https://moonbooks.org/Articles/How-to-filter-missing-data-NAN-or-NULL-values-in-a-pandas-DataFrame-/
    """
    try:
        if not csv_file.endswith('.csv'):
            raise TypeError("Be sure you select .csv file")
        # get punctuation marks as a list: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
        punctuations_list = [mark for mark in string.punctuation]
        # import the csv file and read it with pandas
        data_frame = pandas.read_csv(
            filepath_or_buffer=csv_file,
            header=None,
            skip_blank_lines=True,
            error_bad_lines=True,
            encoding='utf8',
            na_values=punctuations_list
        )
        # if the elevation column is NaN, convert it to 0
        data_frame[3] = data_frame.iloc[:, [3]].fillna(0)
        # if the Description column is NaN, convert it to -
        data_frame[4] = data_frame.iloc[:, [4]].fillna('-')
        # select the coordinate columns
        coord_columns = data_frame.iloc[:, [1, 2]]
        # convert the coordinate columns to a numeric type
        coord_columns = coord_columns.apply(pandas.to_numeric, errors='coerce', axis=1)
        # find rows with missing data
        index_with_nan = coord_columns.index[coord_columns.isnull().any(axis=1)]
        # remove rows with missing data
        data_frame.drop(index_with_nan, 0, inplace=True)
        # iterate over the data frame as tuples
        output_clean_csv = data_frame.itertuples(index=False)
        return output_clean_csv
    except Exception as E:
        print(f"Error: {E}")
        exit(1)

out_data = pandas_clean_csv('csv_files/version2_bad_headers.csl')
for i in out_data:
    print(i[0], i[1], i[2], i[3], i[4])
Here you can download my test CSV files.

Summarize non-zero values or any values from pandas dataframe with timestamps - From_Time & To_Time

I have a dataframe given below.
I want to extract all the non-zero values from each column and put them in a summarized form like this:
if a value is repeated for a period of time, the starting time of that value should go in the 'FROM' column and the end time in the 'TO' column, with the column name in the 'BLK-ASB-INV' column and the value in the 'Scount' column. For this I have started to write the code like this:
import pandas as pd

df = pd.read_excel("StringFault_Bagewadi_16-01-2020.xlsx")
df = df.set_index(['Date (+05:30)'])
cols = ['BLK-ASB-INV', 'Scount', 'FROM', 'TO']
res = pd.DataFrame(columns=cols)
for col in df.columns:
    ss = df[col].iloc[df[col].to_numpy().nonzero()[0]]
    .......
After that I am unable to figure out how I should approach getting the desired output. Is there any way to do this in Python? Thanks in advance for any help.
Finally I have solved my problem; the code given below works perfectly for me.
import pandas as pd

df = pd.read_excel("StringFault.xlsx")
df = df.set_index(['Date (+05:30)'])
cols = ['BLK-ASB-INV', 'Scount', 'FROM', 'TO']
res = pd.DataFrame(columns=cols)
for col in df.columns:
    device = []
    for i in range(len(df[col])):
        if df[col][i] == 0:
            None
        else:
            if i < len(df[col])-1 and df[col][i] == df[col][i+1]:
                try:
                    if df[col].index[i] > device[2]:
                        continue
                except IndexError:
                    device.append(df[col].name)
                    device.append(df[col][i])
                    device.append(df[col].index[i])
                    continue
            else:
                if len(device) == 3:
                    device.append(df[col].index[i])
                    res = res.append({'BLK-ASB-INV': device[0], 'Scount': device[1], 'FROM': device[2], 'TO': device[3]}, ignore_index=True)
                    device = []
                else:
                    device.append(df[col].name)
                    device.append(df[col][i])
                    if i == 0:
                        device.append(df[col].index[i])
                    else:
                        device.append(df[col].index[i-1])
                    device.append(df[col].index[i])
                    res = res.append({'BLK-ASB-INV': device[0], 'Scount': device[1], 'FROM': device[2], 'TO': device[3]}, ignore_index=True)
                    device = []
For reference, here is the output dataframe.
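As a side note, the consecutive-runs logic can also be expressed more compactly with shift/cumsum. This is only a rough sketch (not the code above), and it assumes df is already indexed by 'Date (+05:30)' as in the question:
import pandas as pd

def summarize_runs(df):
    rows = []
    for col in df.columns:
        s = df[col]
        run_id = (s != s.shift()).cumsum()        # new id whenever the value changes
        for _, run in s.groupby(run_id):
            if run.iloc[0] != 0:                  # keep only non-zero runs
                rows.append({'BLK-ASB-INV': col, 'Scount': run.iloc[0],
                             'FROM': run.index[0], 'TO': run.index[-1]})
    return pd.DataFrame(rows, columns=['BLK-ASB-INV', 'Scount', 'FROM', 'TO'])

res = summarize_runs(df)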

Using langdetect output to be imported into a new column in my dataframe

Being rather new to programming with Python, I tried to detect the language of text segments in a pandas data frame.
So first I made a function around the langdetect package:
import pandas as pd
from langdetect import detect

def language_detect(x):
    lang = detect(x)
    print(lang)
My second step would be to feed in the data frame for processing. All the segments that need detecting are in separate rows in the dataframe under the same column header.
result = [language_detect(x) for x in df['column_name']]
df['l_detect'] = pd.append(result)
In the output I see the texts being recognized properly.
But when I try to print result, it returns only the value None for every entry.
So my questions are:
Why do I get None when the print output from the function shows the right values?
How can I attach this to my current data frame, since when I try to append it I get None in every field as well?
Thanks in advance.
The problem is that result contains only None values, because your function language_detect() doesn't return anything (it only prints the results).
import pandas as pd
from langdetect import detect

lst = [('this is a test', 1), ('what language is this?', 4), ('stackoverflow is a website', 23)]
df = pd.DataFrame(lst, columns=['text', 'something'])

def language_detect(x):
    lang = detect(x)
    print(lang)

result = [language_detect(x) for x in df['text']]
result
#Output: [None, None, None]
Just give it a return value:
def language_detect(x):
    lang = detect(x)
    return lang

df['l_detect'] = df['text'].apply(language_detect)
df.head()
#Output:
#                         text  something l_detect
#0              this is a test          1       en
#1      what language is this?          4       en
#2  stackoverflow is a website         23       en
and it will work as expected.
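If, as in the first question above, only the English rows should be kept, you can then filter on the new column:
df_en = df[df['l_detect'] == 'en']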

Python function to loop through columns to replace strings

I'm new to Python, and I've found this community to be quite helpful so far. I've found a lot of answers to my other questions, but I can't seem to figure this one out.
I'm trying to write a function to loop through columns and replace '%', '$', and ','. When I import the .csv through pandas, about 80 of my 108 columns have dtype == object and need to be converted to float.
I've found I can write:
df['column_name'] = df['column_name'].str.replace('%', '')
and it successfully executes and strips the %.
Unfortunately I have a lot of columns (108) and want to write a function to take care of the problem. I came up with the code below, which only executes on some of the columns and produces an odd error:
# get column names
col_names = list(df.columns.values)

# start cleaning data
def clean_data(x):
    for i in range(11, 109, 1):
        if x[col_names[i]].dtype == object:
            x[col_names[i]] = x[col_names[i]].str.replace('%', '')
            x[col_names[i]] = x[col_names[i]].str.replace('$', '')
            x[col_names[i]] = x[col_names[i]].str.replace(',', '')
AttributeError: 'DataFrame' object has no attribute 'dtype'
Even though the error stops the process, some of the columns do get cleaned up. I can't figure out why it doesn't clean all of the columns before returning the 'dtype' error.
I'm running Python 3.6.
Welcome to Stack Overflow.
As for the error itself: the AttributeError most likely means that x[col_names[i]] returned a DataFrame rather than a Series (a DataFrame has .dtypes but no .dtype), which typically happens when column names are duplicated.
If you want to do this for every column, use the DataFrame's apply function; there is no need to loop:
df = pd.DataFrame([['1$', '2%'],] * 3, columns=['A', 'B'])

def myreplace(s):
    for ch in ['%', '$', ',']:
        s = s.map(lambda x: x.replace(ch, ''))
    return s

df = df.apply(myreplace)
print(df)
If you want to do it only for some columns, use the map function of the Series; there is no need to loop:
df = pd.DataFrame([['1$', '2%'],] * 3, columns=['A', 'B'])

def myreplace(s):
    for ch in ['%', '$', ',']:
        s = s.replace(ch, '')
    return s

df['A'] = df['A'].map(myreplace)
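Since the goal is to end up with float columns, the stripping step can be followed by pd.to_numeric; a minimal self-contained sketch (strip_marks is simply the column-wise myreplace from the first example under another name):
import pandas as pd

df = pd.DataFrame([['1$', '2%'],] * 3, columns=['A', 'B'])

def strip_marks(s):
    # remove %, $ and , from every value in the column
    for ch in ['%', '$', ',']:
        s = s.map(lambda x: x.replace(ch, ''))
    return s

df_clean = df.apply(strip_marks).apply(pd.to_numeric, errors='coerce')
print(df_clean.dtypes)   # both columns are now numeric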

Fuzzy logic for Excel data - Pandas

I have two dataframes: DF (~100k rows), which is a raw data file, and DF1 (15k rows), a mapping file. I'm trying to match the DF.Address and DF.Name columns to DF1.Address and DF1.Name. Once a match is found, DF1.ID should be populated in DF.ID (if DF1.ID is not None); otherwise DF1.top_ID should be populated in DF.ID.
I'm able to match the address and name with the help of fuzzy logic, but I'm stuck on how to use the result obtained to populate the ID.
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from operator import itemgetter

df = pd.read_excel("Test1", index=False)
df1 = pd.read_excel("Test2", index=False)
df = df[df['ID'].isnull()]
zip_code = df['Zip'].tolist()
Facility_city = df['City'].tolist()
Address = df['Address'].tolist()
Name_list = df['Name'].tolist()

def fuzzy_match(x, choice, scorer, cutoff):
    return (process.extractOne(x,
                               choices=choice,
                               scorer=scorer,
                               score_cutoff=cutoff))

for pin, city, Add, Name in zip(zip_code, Facility_city, Address, Name_list):
    #====Address Matching=====#
    choice = df1.loc[(df1['Zip'] == pin) & (df1['City'] == city), 'Address1']
    result = fuzzy_match(Add, choice, fuzz.ratio, 70)
    #====Name Matching========#
    if (result is not None):
        if (result[3] > 70):
            choice_1 = (df1.loc[(df1['Zip'] == pin) & (df1['City'] == city), 'Name'])
            result_1 = (fuzzy_match(Name, choice_1, fuzz.ratio, 95))
            print(ID)
            if (result_1 is not None):
                if (result_1[3] > 95):
                    # Here populating the matching ID
                    print("ok")
                else:
                    continue
            else:
                continue
        else:
            continue
    else:
        continue
IIUC, here is a solution:
from fuzzywuzzy import fuzz
import pandas as pd
#Read raw data from clipboard
raw = pd.read_clipboard()
#Read map data from clipboard
mp = pd.read_clipboard()
#Merge raw data and mp data as following
dfr = mp.merge(raw, on=['Hospital Name', 'City', 'Pincode'], how='outer')
#dfr will have many duplicate rows - eliminate duplicates
#To eliminate duplicates using token_sort_ratio, compare address x and y
dfr['SCORE'] = dfr.apply(lambda x: fuzz.token_sort_ratio(x['Address_x'], x['Address_y']), axis=1)
#Filter only max ratio rows grouped by Address_x
dfr1 = dfr.iloc[dfr.groupby('Address_x').apply(lambda x: x['SCORE'].idxmax())]
#dfr1 shall have the desired result
This link has sample data to test the solution provided.
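To then populate the ID as described in the question, one option is a fillna on the merged result. This is only a hypothetical sketch; the column names below (ID, top_ID) come from the question and may end up suffixed (e.g. ID_x/ID_y) after the merge, so adjust them to the actual columns of dfr1:
# hypothetical column names - adjust to the real ones in dfr1
dfr1['ID'] = dfr1['ID'].fillna(dfr1['top_ID'])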
