Categorize data from sentence column - python-3.x

I am trying to add a column of one-word categories from analyzing a column that contains a sentence in each row
I tried the following code but it kept giving me errors!
# Map a loan-purpose sentence to a one-word category by substring matching;
# returns 'undefined' when no keyword matches.
def loan_cat(row):
# NOTE(review): `row` is never read below; the body indexes the global `df` instead.
# NOTE(review): presumably `.count()` yields a single integer here, which the
# `for` loop below cannot iterate over — confirm.
rows = df[df.columns[0]].count()
for i in rows:
data = df['purpose'][i]
if 'house' in data:
return 'house'
# NOTE(review): `|` binds tighter than `in`, so 'education' | 'university' is
# evaluated first; `|` is not defined for two str operands (TypeError).
elif 'education' | 'university' in data:
return 'education'
elif 'wedding' in data:
return 'wedding'
elif 'car' in data:
return 'car'
elif 'real' in data:
return 'real estate'
elif 'property'in data:
return 'property'
return 'undefined'
df['purpose_1'] = df.apply(loan_cat, axis=1)
is there a better way to analyze and categorize the data?

Use a dict
import pandas
data = pandas.Series(["purchase a house",
"purchase car",
"supplemental education",
"burger",
"attend university"])
arr = {"house": "house",
"education": "education",
"university": "education",
"car": "car"}
def foo(s, d):
    """Return the label of the first keyword of *d* that occurs in *s*.

    Keywords are tried in the iteration order of *d*; "NA" is returned when
    none of them is a substring of *s*.
    """
    matches = (label for keyword, label in d.items() if keyword in s)
    return next(matches, "NA")
data.apply(lambda x: foo(x, arr))
# 0 house
# 1 car
# 2 education
# 3 NA
# 4 education
# dtype: object

I figured out the answer:
def loan_cat(value):
    """Return the loan category for a purpose string, or 'undefined'.

    Keywords are tried in order and the first one found as a substring of
    *value* decides the category.
    """
    keyword_categories = (
        ('hous', 'House'),
        ('educ', 'Education'),
        ('university', 'Education'),
        ('wedding', 'Wedding'),
        ('car', 'Car'),
        ('real', 'Real Estate'),
        ('property', 'Property'),
    )
    for keyword, category in keyword_categories:
        if keyword in value:
            return category
    return 'undefined'
df['purpose_cat'] = df['purpose'].apply(lambda value: loan_cat(value))
print(df['purpose_cat'].value_counts())

Related

Python3 - recursively replace all keys with periods in a nested dictionary

I am trying to clean up a nested dictionary before inserting it into Mongo. Some of the keys in the dict have periods in them so I need to replace them with underscores. Based on other posts I have seen I have come up with this (not working) code sample:
# Build and return a new dict from `search_dict`; keys containing '.' or ' '
# have those characters replaced by '_' and are lower-cased.
def get_recursively(search_dict):
new_dict = {}
for key, value in search_dict.items():
if '.' in key or ' ' in key:
# NOTE(review): the value is stored unchanged here, so a dict or list held
# under a renamed key is never descended into.
new_dict[key.replace('.', '_').replace(' ', '_').lower()] = value
elif isinstance(value, dict):
results = get_recursively(value)
for key2, value2 in results.items():
# NOTE(review): dict() accepts at most one positional argument, so this call
# raises TypeError; each iteration would also overwrite new_dict[key].
new_dict[key] = dict(key2, value2)
elif isinstance(value, list):
for item in value:
if isinstance(item, dict):
more_results = get_recursively(item)
for key3, value3 in more_results.items():
# NOTE(review): same two-argument dict() call as above.
new_dict[key] = dict(key3, value3)
else:
new_dict[key] = value
return new_dict
I am trying to make a new dictionary because when I tried to modify the existing dictionary I got an error about the dictionary changing during execution.
The code that is not valid (at least) is:
dict(key2, value2)
That is not valid syntax but hopefully shows my thought process at least.
Any help much appreciated.
If I understood correctly, is this what you meant?
def change_chars(string, chars, new_char):
    """Return *string* with every element of *chars* replaced by *new_char*.

    The replacements are applied one after another, in the order in which
    *chars* yields them.
    """
    for target in chars:
        string = string.replace(target, new_char)
    return string
def recursively_change_keys(obj, chars, new_char):
    """Return *obj* with every dict key cleaned through ``change_chars``.

    Lists and dicts are rebuilt recursively; any other value is returned
    unchanged.
    """
    if isinstance(obj, dict):
        renamed = {}
        for key, value in obj.items():
            new_key = change_chars(key, chars, new_char)
            renamed[new_key] = recursively_change_keys(value, chars, new_char)
        return renamed
    if isinstance(obj, list):
        return [recursively_change_keys(item, chars, new_char) for item in obj]
    return obj
So you just have to call it like recursively_change_keys(search_dict, [".", " "], "_")
Try:
import json
d = {
"some.key": [
{
"key.1": {"a": 1},
"key.2": 2,
"key.3": {"key.4": [3, 4, 5], "key.5": 6},
}
]
}
def transform(d):
    """Replace '.' with '_' in every dict key of *d*, recursively.

    Dicts and lists are rebuilt; any other value is returned unchanged.
    """
    if isinstance(d, list):
        return [transform(item) for item in d]
    if not isinstance(d, dict):
        return d
    renamed = {}
    for key, value in d.items():
        new_key = key.replace(".", "_")
        renamed[new_key] = transform(value)
    return renamed
# pretty print the dictionary:
print(json.dumps(transform(d), indent=4))
Prints:
{
"some_key": [
{
"key_1": {
"a": 1
},
"key_2": 2,
"key_3": {
"key_4": [
3,
4,
5
],
"key_5": 6
}
}
]
}

How to add currency sign to all integers in a list of nested dictionaries

Given a list of nested dictionaries how can I add the currency locale to all the integer values using the locale module. My current solution works however I could not figure out how to make it work with nested dictionaries nor does it feel pythonic.
Example input
[
{
'name':'Tom',
'total_salary': 70000,
'salary': {
'base': 65000,
'bonus': 5000
}
},
{
'name':'Andrew',
'total_salary': 50000,
'salary': {
'base': 45000,
'bonus': 5000
}
}
]
Wanted output
[
{
'name':'Tom',
'total_salary': '$70000',
'salary': {
'base': '$65000',
'bonus': '$5000'
}
},
{
'name':'Andrew',
'total_salary': '$50000',
'salary': {
'base': '$45000',
'bonus': '$5000'
}
}
]
current solution
import locale
locale.setlocale( locale.LC_ALL, 'en_CA.UTF-8' )
def add_currency_locale(_list):
    """Format the numeric-looking top-level values of each dict as currency.

    Every value is passed through ``float()`` and, when that succeeds,
    replaced in place by ``locale.currency(value, grouping=True)``. Values
    that cannot be converted (text, nested dicts, lists, ``None``) are left
    untouched, and so is every value when the active locale cannot format
    currency.

    Args:
        _list: Iterable of dicts; the dicts themselves are modified in place.

    Returns:
        A new list holding the same (now updated) dict objects.
    """
    new_list = []
    for d in _list:
        # Iterate over a snapshot because values are reassigned in the loop.
        for k, v in list(d.items()):
            try:
                d[k] = locale.currency(float(v), grouping=True)
            except (TypeError, ValueError, OverflowError):
                # Only conversion/formatting failures are expected here; anything
                # else (e.g. KeyboardInterrupt) must propagate instead of being
                # silently swallowed.
                continue
        new_list.append(d)
    return new_list
Because you have the line locale.setlocale( locale.LC_ALL, 'en_CA.UTF-8' ), I think you don't want to have the local currency symbol, but want it to always be '$'. If so, here is my solution, otherwise you can easily replace the line where I set new_value. I'm using recursion to correctly handle cases when you have nested lists or dictionaries (the code you've provided seems to not work for those cases, but according to the example input and output you need this. If you don't, remove the part with instance checking and replace the line except ValueError: with except (ValueError, TypeError):). Pay attention to the notes I left in the comments
# Note: variable names with one leading underscore are "private" according to python code style.
# Use trailing underscore instead
def add_currency_locale(list_):
    """Prefix every number in a list of (possibly nested) dicts with '$'.

    Dicts are updated in place; nested dicts and lists are processed
    recursively. List elements that are not dicts are handled as well:
    numbers are formatted, nested lists are recursed into and anything else
    is kept as is. Booleans are left alone even though ``bool`` is a
    subclass of ``int``.

    Args:
        list_: List whose elements are dicts, lists or plain values.

    Returns:
        A new list with the processed elements in their original order.
    """
    def is_number(value):
        # `True`/`False` pass an `int` check but are not amounts of money.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    new_list = []
    for d in list_:
        if isinstance(d, dict):
            # Note: no need to convert `d.items()` to list: you can iterate over the original
            # object, and the conversion takes time (only existing keys are reassigned, so
            # the dictionary never changes size during the loop)
            for k, v in d.items():
                if isinstance(v, dict):
                    # Because `add_currency_locale` only works with arrays, make an array of one
                    # dictionary and then only use the first (and only) element of the returned list
                    d[k] = add_currency_locale([v])[0]
                elif isinstance(v, list):
                    d[k] = add_currency_locale(v)
                elif is_number(v):
                    d[k] = f'${v}'
        elif isinstance(d, list):
            d = add_currency_locale(d)
        elif is_number(d):
            d = f'${d}'
        new_list.append(d)
    return new_list
You can use this code to iterate through all the data elements and get the $ symbol assigned to each value.
def add_curr(dict_list):
    """Prefix int/float values with '$' in each dict and its nested dicts.

    Only one level of nesting is handled: a dict stored as a value has its
    own numeric values converted, but anything deeper is left as is. The
    dicts are modified in place.

    Args:
        dict_list: Iterable of dicts to convert.

    Returns:
        A new list holding the same (now updated) dict objects.
    """
    new_lst = []
    for dic in dict_list:
        for k1, v1 in dic.items():
            if not isinstance(v1, dict):
                if isinstance(v1, (int, float)):
                    dic[k1] = '${}'.format(v1)
                else:
                    # Non-numeric values are written back unchanged.
                    dic[k1] = v1
            else:
                for k2, v2 in v1.items():
                    if isinstance(v2, (int, float)):
                        dic[k1][k2] = '${}'.format(v2)
                    else:
                        dic[k1][k2] = v2
        new_lst.append(dic)
    return new_lst
add_curr(mylist)
You could decide to use the conversion part as a separate function and call it each time with a key and value
if isinstance(v1, (int, float)):
dic[k1] = '${}'.format(v1)
else:
dic[k1] = v1
This will give you the following dict:
[{'name': 'Tom',
'total_salary': '$70000',
'salary':
{'base': '$65000',
'bonus': '$5000'}},
{'name': 'Andrew',
'total_salary': '$50000',
'salary':
{'base': '$45000',
'bonus': '$5000'}}]
If you want to use the locale module:
lst = [
{
'name':'Tom',
'total_salary': 70000,
'salary': {
'base': 65000,
'bonus': 5000
}
},
{
'name':'Andrew',
'total_salary': 50000,
'salary': {
'base': 45000,
'bonus': 5000
}
}
]
import locale
locale.setlocale( locale.LC_ALL, 'en_CA.UTF-8' )
def cnvrt(dct):
    """Format the int/float values of *dct* as locale currency, in place.

    Nested dicts are converted recursively; every other value is left
    untouched. The same dict object is returned.
    """
    for key, val in dct.items():
        if isinstance(val, dict):
            dct[key] = cnvrt(val)
            continue
        if isinstance(val, (int, float)):
            dct[key] = locale.currency(val, grouping=True)
    return dct
print([cnvrt(i) for i in lst])
However, that won't give you your expected output which is just a $ prepended to the value.
For that you can use.
def cnvrt(dct):
    """Prefix the int/float values of *dct* with '$', in place.

    Nested dicts are converted recursively; every other value is left
    untouched. The same dict object is returned.
    """
    for key, val in dct.items():
        if isinstance(val, dict):
            dct[key] = cnvrt(val)
            continue
        if isinstance(val, (int, float)):
            dct[key] = f'${val}'
    return dct
print([cnvrt(i) for i in lst])
This works by recursively calling cnvrt if the value is a nested dict, otherwise, if it's an int or float it prepends a $. Since it's operating on the expectation of a dict object, you can use it in a nice list comprehension.
Finally, if you really want to you can handle lists with your function but IMO at this point it is doing too much.
def cnvrt(obj):
    """Prefix int/float dict values with '$' throughout nested lists and dicts.

    A list is rebuilt from its converted elements; a dict is updated in
    place and returned. Any other object is returned unchanged.
    """
    if isinstance(obj, list):
        return [cnvrt(element) for element in obj]
    if isinstance(obj, dict):
        for key, val in obj.items():
            if isinstance(val, (dict, list)):
                obj[key] = cnvrt(val)
            elif isinstance(val, (int, float)):
                obj[key] = f'${val}'
    return obj
print(cnvrt(lst))
Although it works with inner lists and dicts as well, there is a lot going on, which makes it hard to follow.

replacing nan values with a function python

I have a big data set (100k+) with many more columns than in the snippet attached. I need to replace missing values with values from a reference table. I found countless articles on how to replace NaN values with the same number, but I can't find relevant help on replacing them with different values obtained from a function. My problem is that np.nan is not equal to np.nan, so how can I make the comparison? What I'm trying to express is: if the value is null, replace it with the corresponding value from the reference table. I have found the way shown below, but it's a dangerous method because the replacement only happens in the exception handler, so if anything else goes wrong I wouldn't see it. Here is the snippet:
sampleData = {
'BI Business Name' : ['AAA', 'BBB', 'CCC', 'CCC','DDD','DDD'],
'BId Postcode' : ['NW1 8NZ', 'NW1 8NZ', 'WC2N 4AA','WC2N 4AA', 'CV7 9JY', 'CV7 9JY',],
'BI Website' : ['www#1', 'www#1', 'www#2', 'www#2','www#3', 'www#3'],
'BI Telephone' : ['999', '999', '666', '001', np.nan, '12345']
}
df = pd.DataFrame(sampleData)
df
and here is my method:
feature = 'BI Telephone'
df[[feature]] = df[[feature]].astype('string')
# Return the `feature` value of `row`, or a value looked up in `reference_table`
# (first matching row, first column) when the current value looks missing.
def missing_phone(row):
try:
old_value = row[feature]
# NOTE(review): `old_value == np.nan` is always False because NaN never compares
# equal to anything, itself included; `pd.isna(old_value)` is the usual test.
# NOTE(review): presumably after `astype('string')` missing entries are pd.NA and
# taking the truth value of a comparison with pd.NA raises, which would explain
# why the replacement only happens in the `except` branch — confirm.
# NOTE(review): the condition continues on the next line without parentheses or
# a backslash — confirm this is not a paste artefact.
if old_value == 'NaN' or old_value == 'nan' or old_value == np.nan or old_value is None or
old_value == '':
reference_value = row[reference_column]
new_value = reference_table[reference_table[reference_column]==reference_value].iloc[0,0]
print('changed')
return new_value
else:
print('unchanged as value is not nan. The value is {}'.format(old_value))
return old_value
# Any exception at all triggers the same lookup as the "missing" branch, so
# unrelated failures inside the `try` are hidden behind a replacement.
except Exception as e:
reference_value = row[reference_column]
new_value = reference_table[reference_table[reference_column]==reference_value].iloc[0,0]
print('exception')
return new_value
df[feature]=df.apply(missing_phone, axis=1)
df
If I don't change the data type to string then the nan is just unchanged. How can I fix it?

for loop with if statement not showing list of lists result

Below is my code, in two versions: the first has a for loop without an if statement, and the second has a for loop with an if statement. From what I have found, without the if statement (the first version) the result shows all employees' information. I'm still learning Python, and thanks in advance.
# Prompt for a new employee's id, name, department and position, then print
# the new record and the records of the global `employees` list.
def addNew():
global employees
newEmp = []
checkList = []
newEmp.append(input("Enter id: "))
newEmp.append(input("Enter name: "))
newEmp.append(input("Enter department: "))
newEmp.append(input("Enter position: "))
# `checkList` holds exactly one entry: the record that was just typed in.
checkList.append(newEmp)
for new in checkList:
print(new)
for exist in employees:
print(exist)
result
['1001', 'das', 'das', 'das'] # from print(new)
['1000', 'tim', 'hr', 'admin'] # from print(exist)
['1003', 'jim', 'hr', 'clerk'] # from print(exist)
['1007', 'ida', 'hr', 'audit'] # from print(exist)
['1005', 'mia', 'itss', 'security'] # from print(exist)
However, in the second version below, when I put the if statement inside the for loop, the result shows only one employee's information.
# Prompt for a new employee record and add it to the global `employees` list
# unless its id (element 0) matches the existing record it is compared with.
def addNew():
global employees
newEmp = []
checkList = []
newEmp.append(input("Enter id: "))
newEmp.append(input("Enter name: "))
newEmp.append(input("Enter department: "))
newEmp.append(input("Enter position: "))
# `checkList` holds exactly one entry: the record that was just typed in.
checkList.append(newEmp)
for new in checkList:
print(new)
for exist in employees:
print(exist)
if new[0] == exist[0]:
print("entered id",new[0],"is already exist")
break
# NOTE(review): this condition is the exact complement of the `if` above and
# both branches `break`, so the loop ends after one `exist` record.
elif new[0] != exist[0]:
employees.extend(checkList)
break
result
['1001', 'das', 'das', 'das'] # from print(new)
['1000', 'tim', 'hr', 'admin'] # from print(exist)
It's because it either went inside the if statement or the elif statement and since both of them have a break statement, the inner loop is terminated immediately even though not all the employees have been gone through yet.

How to scale/drop NaN/0's from graph visually

I would like to start the graph from the first non-zero or non NaN value, also if possible, only connect non-zero/ non NaN terms.
# Build a line-chart `dcc.Graph` titled "<input_data> Average Price" from the
# `input_data` row of the global `test3`.
# NOTE(review): assumes `test3` is a pandas DataFrame — TODO confirm.
def CreateAvgGraph(input_data):
# Select the row labelled `input_data` (list selector), then transpose it.
KK = test3.loc[[input_data],:]
K = KK.T
# Missing values become 0 here, so they are plotted as zeros.
K = K.fillna(0)
# After reset_index() the former index is read back from the 'index' column.
K = K.reset_index()
list1a = K['index'].tolist()
list2a = K[input_data].tolist()
return dcc.Graph(
id='example-graph2',
figure={
'data': [
{'x' : list1a , 'y': list2a, 'type':'line','name' :input_data},
],
'layout': {
'title': str(input_data) + ' Average Price'
}
}
)
[![enter image description here][1]][1]
Removing the fillNa doesn't really help as the view scale is too much.
# Build a line-chart `dcc.Graph` titled "<input_data> Average Price" from the
# `input_data` row of the global `test3`; missing values are passed through.
# NOTE(review): assumes `test3` is a pandas DataFrame — TODO confirm.
def CreateAvgGraph(input_data):
# Select the row labelled `input_data` (list selector), then transpose it.
KK = test3.loc[[input_data],:]
K = KK.T
# After reset_index() the former index is read back from the 'index' column.
K = K.reset_index()
list1a = K['index'].tolist()
list2a = K[input_data].tolist()
return dcc.Graph(
id='example-graph2',
figure={
'data': [
{'x' : list1a , 'y': list2a, 'type':'line','name' :input_data},
],
'layout': {
'title': str(input_data) + ' Average Price'
}
}
)
I have managed to do an ugly fix, but there has to be a better way?
def CreateAvgGraph(input_data):
KK = test3.loc[[input_data],:]
K = KK.T
K = K.fillna(0)
K = K.reset_index()
list1a = K['index'].tolist()
list2a = K[input_data].tolist()
list2aa = []
list1aa =[]
for i in range(0,len(list1a)):
if list2a[i] > 0:
list1aa.append(list1a[i])
list2aa.append(list2a[i])
else:
continue
return dcc.Graph(
id='example-graph2',
figure={
'data': [
{'x' : list1aa , 'y': list2aa, 'type':'line','name' :input_data},
],
'layout': {
'title': str(input_data) + ' Average Price'
If you simply want to plot all non-NaN values, you should drop the NaN values rather than filling them with zeros, i.e. you should replace K.fillna(0) with K.dropna().

Resources