Replacing a string value in Python - python-3.x

I have a column named "status" full of string values either "legitimate" or "phishing". I'm trying to convert them into a 0 for "legitimate" or 1 for "phishing". Currently my approach is to replace "legitimate" with a string value of "0", and "phishing" with a string value of "1", then convert the strings "0" and "1" to the int values 0 and 1. I'm getting the error:
TypeError: '(0, status legitimate
Name: 0, dtype: object)' is an invalid key
with the following code, what am I doing wrong?
df2 = pd.read_csv('dataset_phishing.csv', usecols=[87], dtype=str)
leg = 'legitimate'
phi = 'phishing'
for i in df2.iterrows():
if df2[i] == leg:
df2[i].replace('legitimate', '0')
else if df2[i] == phi:
df2[i].replace('phishing', '1')

Here `iterrows()` yields `(index, row)` tuples, and a tuple can't be used as a key into the DataFrame; that is why you get that error. Here is a simple solution:
import pandas as pd

df2 = pd.DataFrame([["legitimate"], ["phishing"]], columns=["status"])
leg = 'legitimate'
phi = 'phishing'
# Write through the frame itself with .at[label, column]. Chained indexing
# (df2.iloc[i]["status"] = ...) assigns into a temporary row object and is
# not guaranteed to modify df2.
for idx in df2.index:
    df2.at[idx, "status"] = '1' if df2.at[idx, "status"] == phi else '0'
print(df2)
Here is a more Pythonic way to do this:
import pandas as pd
import numpy as np

# Two-row sample frame with a single "status" column.
df2 = pd.DataFrame({"status": ["legitimate", "phishing"]})
leg = 'legitimate'
phi = 'phishing'
# Vectorised relabel: '1' where the status equals phi, '0' everywhere else.
is_phishing = df2["status"].eq(phi)
df2["status"] = np.where(is_phishing, '1', '0')
print(df2)
Hope this helps you

Here is another way to do this
import pandas as pd
import numpy as np

data = {'status': ["legitimate", "phishing"]}
df = pd.DataFrame(data)
leg = 'legitimate'
phi = 'phishing'
# Translate both labels in a single pass. Assigning 0 and 1 through two
# separate masks writes ints into a string column and leaves a column of
# mixed Python objects; map() produces a proper integer column instead.
df["status"] = df["status"].map({leg: 0, phi: 1})
print(df)

Related

Trying to plot a rolling corr line chart but Matplot keeps saying to bring in only valid columns?

I'm trying to create a rolling correlation chart using Matplotlib, but I get the error "select only valid columns before calling the operation. Dropped columns were Index(['time'], dtype='object')".
I have dropped that field from my data frame, but the error keeps appearing.
Is it something to do with my .iloc argument?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import seaborn as sns
import scipy.stats as stats
import json
from datetime import timezone
from datetime import datetime
from pycoingecko import CoinGeckoAPI
pd.options.display.width = 0
def datetime_to_unix(year, month, day):
    """Return the Unix timestamp, in seconds as a float, for midnight of a date.

    The date is built as a naive datetime and measured against the naive
    epoch 1970-01-01, so the result does not depend on the local time zone.

    Example: datetime_to_unix(2021, 6, 1) => 1622505600.0
    """
    epoch = datetime(1970, 1, 1)
    return (datetime(year, month, day) - epoch).total_seconds()
def unix_to_datetime(unix_time):
    """Format a Unix timestamp as a lower-case UTC string.

    Accepts seconds or milliseconds, as int or float. Values of 10**10 or
    more (11+ integer digits) are treated as milliseconds. The hour uses the
    12-hour clock, space-padded to width 2.

    Example: unix_to_datetime(1622505700) => '2021-06-01 12:01am'
    """
    # Decide by magnitude rather than len(str(...)): a float such as
    # 1622505600.0 has 12 characters and would be misread as milliseconds.
    seconds = int(unix_time / 1000 if abs(unix_time) >= 10**10 else unix_time)
    moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
    # Build the 12-hour field by hand: the '%l' strftime code is a platform
    # extension and is not available everywhere.
    hour12 = moment.hour % 12 or 12
    meridiem = 'am' if moment.hour < 12 else 'pm'
    return f"{moment:%Y-%m-%d} {hour12:2d}:{moment:%M}{meridiem}"
# Initialize the client
cg = CoinGeckoAPI()
# Retrieve looksrare data in USD
result = cg.get_coin_market_chart_range_by_id(
id='looksrare',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df = pd.DataFrame({'time':time, 'price':price,})
df.head(100)
# Retrieve ETH data in USD
result = cg.get_coin_market_chart_range_by_id(
id='ethereum',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df2 = pd.DataFrame({'time':time, 'price':price,})
df2.head(100)
df_cd = pd.merge(df, df2, how='inner', on='time')
df_cd = df_cd.drop('time', 1)
output = df_cd.corr()
output1 = df_cd['price_x'].corr(df_cd['price_y'])
overall_pearson_r = df_cd.corr().iloc[0,1]
print(df_cd)
print(f"Pandas computed Pearson r: {overall_pearson_r}")
r, p = stats.pearsonr(df_cd.dropna()['price_x'], df_cd.dropna()['price_y'])
print(f"Scipy computed Pearson r: {r} and p-value: {p}")
# compute rolling window synchrony
f,ax=plt.subplots(figsize=(7,3))
df.rolling(window=30,center=True).median().plot(ax=ax)
ax.set(xlabel='Time',ylabel='Pearson r')
ax.set(title=f"Overall Pearson r = {np.round(overall_pearson_r,2)}");

How to add entire dataframe row as scatter plot annotation

I'm plotting two columns of a Pandas DataFrame on a scatterplot and I want each point to show all the row values of the DataFrame. I've looked at this post, and tried to do something similar with mplcursors:
import pandas as pd
from datetime import date, datetime, time, timedelta
import numpy as np
import matplotlib.pyplot as plt
from mplcursors import cursor
df = pd.DataFrame()
df['datetime'] = pd.date_range(start='2016-01-01', end='2016-01-14', freq='30T')
#df = df.set_index('datetime')
df['x1'] = np.random.randint(-30, 30, size=len(df))
df['x2'] = np.random.randint(-30, 20, size=len(df))
df['x3'] = np.random.randint(-20, 30, size=len(df))
df['y1'] = np.random.randint(-100, 100, size=len(df))
df['y2'] = np.random.randint(-300, 200, size=len(df))
df['y3'] = np.random.randint(-200, 300, size=len(df))
def conditions(s):
    """Return the category label for one row.

    `s` is indexed by the keys 'y1', 'y3', 'x3' and 'x2'. The rules are
    checked in order and the first match wins:
    'group1' if y1 > 20 or y3 < 0, 'group2' if x3 < 20,
    'group3' if x2 == 0, otherwise 'group4'.
    """
    if s['y1'] > 20 or s['y3'] < 0:
        return 'group1'
    if s['x3'] < 20:
        return 'group2'
    if s['x2'] == 0:
        return 'group3'
    return 'group4'
df['category'] = df.apply(conditions, axis=1)
fig = plt.figure(figsize=(12,4))
ax1 = plt.subplot(121)
ax1.scatter(df.x1, df.y1, label='test1')
ax1.scatter(df.x2, df.y2, label='test2')
#cursor(hover=True)
ax1.set_xlabel('test1')
ax1.set_ylabel('test2')
ax1.legend(['test1','test2'])
cr1 = cursor(ax1,hover=True)
#ax1.annotation_names = df.columns.tolist()
cr1.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
ax2 = plt.subplot(122)
ax2.scatter(df.x1, df.y1, label='test1')
ax2.scatter(df.x3, df.y3, label='test3')
ax2.set_xlabel('test1')
ax2.set_ylabel('test3')
ax2.legend(['test1','test3'])
cr2 = cursor(ax2,hover=True)
#ax2.annotation_names = df.columns.tolist()
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
# save figure
import pickle
pickle.dump(fig, open('FigureObject.fig.pickle', 'wb'))
plt.show()
When I hover over a point, I want to see a label containing (for example):
datetime = 2016-01-01 00:00:00
x1 = 1
x2 = -4
x3 = 22
y1 = -42
y2 = -219
y3 = -158
category = group1
but I get this type of error:
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
IndexError: list index out of range
How do I fix it?
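The IndexError occurs because of df.columns.tolist()[x.target.index]
df.columns.tolist() is a list of the 8 column names, which is then indexed by [x.target.index], the index of the hovered point; any point index of 8 or more is out of range for that list.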
df.iloc[x.target.index, :].to_dict() will get the desired row data for the point as a dict
A list comprehension creates a list of strings for each key value pair
'\n'.join(...) creates a string with each column separated by a \n
In mplcursors v0.5.1, Selection.target.index is deprecated, use Selection.index instead.
df.iloc[x.index, :] instead of df.iloc[x.target.index, :]
cr1.connect("add", lambda x: x.annotation.set_text('\n'.join([f'{k}: {v}' for k, v in df.iloc[x.index, :].to_dict().items()])))
Alternatively, use .to_string()
cr1.connect("add", lambda x: x.annotation.set_text(df.iloc[x.index, :].to_string()))

Extract the string after particular pattern value before and after

I have a pandas DataFrame and I'd like to extract the value that comes after pb~ and before _, a space, or the end of the string.
So the string looks like pb~value_ or pb~value' ' or pb~value''.
import pandas as pd
data = {'PName': ['ag~fbai-churnsoon_mk~de_at~lia_sa~fcs_tg~fbai_ts~alldevice-allgender-13-65_md~c_pb~fcbk_rt~cpm',
'pb~precision disclosed desktop_sz~300x600_pd~halfp-dmp-hubble w tablets_ch~dis_dt~dt_fm~ban_it~poe_vv~si_ad~as_rt~cpm_tg~rtg_sa~redc_ts~none_md~w_ff~pr-teas-rt']}
# Creates pandas DataFrame.
df = pd.DataFrame(data)
print(df)
# print the data
expected Output
PName Values
ag~fbai-churnsoon_mk~de_at~lia_sa~fcs_tg~fbai_ts~alldevice-allgender-13-65_md~c_pb~fcbk_rt~cpm fcbk
pb~precision disclosed desktop_sz~300x600_pd~halfp-dmp-hubble w tablets_ch~dis_dt~dt_fm~ban_it~poe_vv~si_ad~as_rt~cpm_tg~rtg_sa~redc_ts~none_md~w_ff~pr-teas-rt precision
I tried with
df['value'] = df['PName'].str.extract("")
but not able to figure out how can I extract the values.
import pandas as pd
import re

data = {'PName': ['ag~fbai-churnsoon_mk~de_at~lia_sa~fcs_tg~fbai_ts~alldevice-allgender-13-65_md~c_pb~fcbk_rt~cpm',
                  'pb~precision disclosed desktop_sz~300x600_pd~halfp-dmp-hubble w tablets_ch~dis_dt~dt_fm~ban_it~poe_vv~si_ad~as_rt~cpm_tg~rtg_sa~redc_ts~none_md~w_ff~pr-teas-rt']}
# Creates pandas DataFrame.
df = pd.DataFrame(data)
# Raw-string pattern: lazily capture what follows "pb~" up to the first "_",
# space, or end of string, so a trailing "pb~value" is matched as well.
PB_PATTERN = r'pb~([\s\S]*?)(?:_| |$)'
# str.extract yields NaN for rows without a "pb~" marker instead of raising
# IndexError the way re.findall(...)[0] does.
df['value'] = df['PName'].str.extract(PB_PATTERN, expand=False)
df
PName value
0 ag~fbai-churnsoon_mk~de_at~lia_sa~fcs_tg~fbai_... fcbk
1 pb~precision disclosed desktop_sz~300x600_pd~h... precision
Try non-greedy(lazy) matching
# Also stop at end of string so a trailing "pb~value" is captured.
df['PName'].str.extract(r'pb~(.+?)(?:[_ ]|$)')
Out[55]:
0
0 fcbk
1 precision

Use if statement within .str.find()

I would like to know If I have an if statement that looks something like this:
if int(i) > 10:
return 0
else:
return -1
where i is equivalent to a row entry in df["price"] (df is a pandas dataframe) defined as follows:
import pandas as pd
df = pd.DataFrame(columns=["price", "Number"], data=[["10", "07367"], ["20", "08356"], ["9", "07745"]])
how can I use df["price"].str.find(...) together with the above if statement to filter the data by the true condition?
I would like output that looks like the following:
0 -1
1 0
2 -1
I have been struggling with how to implement it, please assist.
Generally it's easiest to first convert to optimal dtypes. That way all operations will be quicker - of course, it depends on your application whether this matters. But if things are numbers, let them be numbers (explicit > implicit).
import pandas as pd

df = pd.DataFrame(columns=["price", "Number"], data=[["10", "07367"], ["20", "08356"], ["9", "07745"]])
df['price'] = df['price'].astype(int)  # or float
# The source column is named "Number" (capital N); df.number would raise
# AttributeError. Converting to int drops the leading zeros, so the
# original strings are kept in "Number".
df['number'] = df['Number'].astype(int)
You can then add your criteria as a column (or just use the output). Apply or map are not so quick, so it's better to use the np.where suggested by others or any other comparison that will use numpy under the hood. For example:
df['criteria'] = -1 * (df.price <= 10).astype(int) # quicker to not use map or apply
df.criteria
You could use gt + map:
import pandas as pd

rows = [["10", "07367"], ["20", "08356"], ["9", "07745"]]
df = pd.DataFrame(rows, columns=["price", "Number"])
# Boolean mask: True where the numeric price exceeds 10.
above_ten = df["price"].astype(int) > 10
# Translate the mask to the requested codes: 0 when True, -1 when False.
result = above_ten.map({True: 0, False: -1})
print(result)
Output
0 -1
1 0
2 -1
Name: price, dtype: int64
Or if you prefer, you could use np.where, as mentioned by @coldspeed in the comments.
import numpy as np
import pandas as pd

rows = [["10", "07367"], ["20", "08356"], ["9", "07745"]]
df = pd.DataFrame(rows, columns=["price", "Number"])
# Compare the prices as integers, then pick 0 or -1 element-wise.
prices = df["price"].astype(int)
result = np.where(prices > 10, 0, -1)
print(result)
Output
[-1 0 -1]
You can use np.where:
df['price'] =df['price'].astype(int)
df['output'] = np.where(df['price']>10, 0, -1)
df
price Number output
0 10 07367 -1
1 20 08356 0
2 9 07745 -1
The syntax is: np.where(condition, valueIfTrue, valueIfFalse)
You can simply use a lambda function:
df.price.apply(lambda x : 0 if int(x)>10 else -1)

Error Unorderable Types. Select rows where on hdf5 files

I work with Python 3.5 and I have the following problem importing some data from an HDF5 file.
Here is a very simple example that summarizes what happens. I created a small dataframe and stored it in an HDF5 file. Then I tried to select from this HDF5 file the rows whose value in column "A" is less than 1, and I get the error:
"Type error: unorderable types: str() < int()"
image
import pandas as pd
import numpy as np
import datetime
import time
import h5py
from pandas import DataFrame, HDFStore
def test_conected():
hdf_nombre_archivo ="1_Archivo.h5"
hdf = HDFStore(hdf_nombre_archivo)
np.random.seed(1234)
index = pd.date_range('1/1/2000', periods=3)
df = pd.DataFrame(np.random.randn(3, 4), index=index, columns=
['A', 'B','C','F'])
print(df)
with h5py.File(hdf_nombre_archivo) as f:
df.to_hdf(hdf_nombre_archivo, 'df',format='table')
print("")
with h5py.File(hdf_nombre_archivo) as f:
df_nuevo = pd.read_hdf(hdf_nombre_archivo, 'df',where= ['A' < 1])
print(df_nuevo )
def Fin():
    """Print a line holding a single space, then the end marker "FIN"."""
    print(" ")
    print("FIN")
if __name__ == "__main__":
test_conected()
Fin()
print(time.strftime("%H:%M:%S"))
I have been investigating but I haven't managed to solve this error. Any ideas?
Thanks
Angel
where= ['A' < 1]
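In your condition, 'A' is a string and 1 is an int, and Python evaluates 'A' < 1 itself before pandas ever sees it, which is why the comparison fails. The operands must be of the same type to be compared, so you could typecast one of them; however, to filter on column A you should pass the whole condition to pandas as a string instead, e.g. where='A < 1'.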
ex:
str(1)

Resources