Print REGEX using USER DEFINED FUNCTION - python-3.x

I'm trying to print the variables ccb_3, nome, data, taxa and parcela using the function I defined as "ext_ccb", but when I run the code it returns 3 times (because I defined q as 3) the variable ccb_3.
I tried splitting it into 2 functions (one with the variable ccb_3 e one with the rest that uses REGEX) but it didn't worked to.
'''
from PyPDF2 import PdfFileReader, PdfFileWriter
import re
x = 1
q = 3
def ext_ccb():
nome_ccb = str("Vazio (" + y + ").pdf")
ccb = PdfFileReader(nome_ccb)
ccb_obj_1 = ccb.getPage(0)
ccb_text_1 = ccb_obj_1.extractText()
ccb_obj_2 = ccb.getPage(1)
ccb_text_2 = ccb_obj_2.extractText()
ccb_3 = ccb_text_1[1:8]
print(ccb_3)
pattern_nome = re.compile(r'''[^\n][^CPF][A-Z](|\.)\w*\s*.*$
Nome Completo
''', re.M)
matches_nome = pattern_nome.finditer(ccb_text_1)
for match in matches_nome:
nome = str(match)
nome = nome[40:].replace(r"\n\nNome Completo\n'>", "")
print(nome)
pattern_data = re.compile(r'''5\.2\. Modalidade
\d{2}/\d{2}/\d{4}
''')
matches_data = pattern_data.findall(ccb_text_1)
for match in matches_data:
data = match[17:27]
print(data)
pattern_taxa = re.compile(r'''Taxa de Juros a\.m\. \(%\)
\d*,\d*''')
matches_taxa = pattern_taxa.findall(ccb_text_2)
for match in matches_taxa:
taxa = match[24:]
print(taxa)
pattern_vparcela = re.compile(r'''Valor das Parcelas
R\$ \d*,\d*''')
matches_vparcela = pattern_vparcela.findall(ccb_text_2)
for match in matches_vparcela:
parcela = match[23:]
print(parcela)
while x <= q:
y = str(x)
x += 1
ext_ccb()
'''
What I really need is to insert it into an csv, multiple times from different PDF's, which I already have the code for:
'''
from csv import writer
x = 5
q = 0
while q < x:
q += 1
ccb_3 += 1
nome += 2
data += 4
taxa += 4
parcela += 5
list_data = [ccb_3, nome, data, taxa, parcela]
with open('csv_teste.csv', 'a', newline = '') as f_object:
writer_object = writer(f_object)
writer_object.writerow(list_data)
f_object.close()
'''
How can I save each data from each PDF and put it into the CSV?

Related

python replace '?' with not successive or preceding character

I am trying to solve a riddle, the challenge is to replace the question mark in a string by not using the previous or the following character in that string
For example:-
riddle = 'abcd?ef?'
expected_out = 'abcdiefa'
riddle = '???'
expected_out = 'aea'
This is the solution that I have tried but for some reason it isn't working
successor_element = ''
predecessor_element = ''
my_pre_succ_elements = []
riddle = "ab?ac?"
required_list = []
def solution(riddle):
my_replacers = ['a','e','i']
j = len(riddle)
print(j)
for e in range(0,j):
req_element = riddle[e]
print(e)
print(req_element)
if req_element == '?':
if e == 0:
successor_element = riddle[e+1]
if e == j-1:
predecessor_element = riddle[e-1]
if (e!= 0) and (e != j-1):
successor_element = riddle[e+1]
predecessor_element = riddle [e-1]
my_pre_succ_elements.extend(successor_element)
my_pre_succ_elements.extend(predecessor_element)
required_list = list(set(my_pre_succ_elements)^set(my_replacers))
substitutor = required_list[0]
riddle = str(riddle[0:e]) + str(substitutor) + str(riddle[e + 1:])
print(riddle)
pass
You were pretty close. This should work:
if req_element == '?':
possibles = ['a','e','i']
if e > 0 and riddle[e-1] in possibles:
possibles.remove(riddle[e-1])
if e < j-1 and riddle[e+1] in possibles:
possibles.remove(riddle[e+1])
substitutor = possibles[0]
riddle = riddle[:e] + substitutor + riddle[e+1:]

Pandas Group By Weird Behaviour

def quant(X,col_):
print('X\n',X.head(5))
q25 = np.quantile(X[col_],0.15)
q75 = np.quantile(X[col_],0.85)
total = X[col_].tolist()
ltq = []
mtq = []
iqr = []
for i in total:
if i < q25:
ltq.append(i)
elif i > q75:
mtq.append(i)
else:
iqr.append(i)
p_l_q = 100*(len(ltq))/len(total)
p_l_m = 100*(len(mtq))/len(total)
percent_iqr = 100*len((iqr))/len(total)
X['p_l_q'] = p_l_q
X['p_l_m'] = p_l_m
X['p_l_i'] = percent_iqr
X['count'] = len(total)
X_short = X[['p_l_q','p_l_m','p_l_i','count']].copy(deep = True)
print(X_short[:1])
new = X_short[:1]
return new
X = pd.DataFrame()
X['G'] = ''
X['H'] = ''
X['M'] = ''
lst1 = ['a','a','a','a','a','b','b','b','b','b','b','c','c','c','c']
lst2 = [10,12,13,45,52,34,78,34,56,79,90,65,56,43,11]
lst3 = [1,2,3,4,5,1,2,3,4,5,6,1,2,3,4]
X['G'] = lst1
X['H'] = lst2
X['M'] = lst3
X_q = X.groupby('G').apply(quant,'H').reset_index()
I have used a print statement to give me head of dataframe block for each unique 'G' but I get weird print like in image.
There should be exactly three print outputs.(for each unqiue G) but it is showing 5 on top of that second print output (G='b') has H values as that of G ='a'.
Try replacing this:
def quant(X,col_):
print('X\n',X.head(5))
With this:
def quant(XX,col_):
X = XX.copy()
print('X\n',XX.head(5))
del XX # Delete 'XX' because 'X' copy is available
Output

Scipy optimize.minimize with multi- parameters

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import linalg, optimize
%matplotlib inline
Data load
data = pd.read_csv("D:/Stat/TimeSeries/KRW_month_0617_1.csv",index_col="Date") / 100
para = open("D:/Stat/TimeSeries/KRW_month_0617_1.txt").readlines()[0:2]
data.index = pd.to_datetime(data.index)
Parameters
cond = []
params = []
time = []
for i in para:
j = i.split()
for k in j:
cond.append(k)
cond = cond[1:]
for i in range(len(cond)):
cond[i] = round(float(cond[i]),4)
params = cond[0:23]
time = cond[23:]
maturity = np.array(time[1:])
timegap = 1/cond[23]
Functions We need
def Paramcheck(Params, checkStationary = 1):
result = 0
Kappa = np.array([[params[20],0,0], [0,params[21],0], [0,0,params[22]]])
Sigma = np.array([[params[1],0,0], [params[2],params[3],0], [params[4],params[5],params[6]]])
State = np.array([params[7], params[8], params[9]])
Lambda = params[0]
SigmaEps = np.identity(10)
for i in range(10):
SigmaEps[i][i] = params[i+10]
for i in range(len(Sigma)):
if Sigma[i][i] < 0:
result = 1
for j in SigmaEps:
if np.any(SigmaEps) < 0:
result = 1
if Lambda < 0.05 or Lambda > 2:
result = 1
elif State[0] < 0:
result = 1
elif Kappa[0][0] < 0:
result = 1
if result == 0 and checkStationary > 0:
if max(np.linalg.eigvals(-Kappa).real) > 0:
result = 2
return result
def CheckDet(x):
if x == np.inf or x == np.nan:
result = 1
elif x < 0:
result = 2
elif abs(x) < 10**-250:
result = 3
else:
result = 0
return result
def NS_factor(lambda_val, maturity):
col1 = np.ones(len(maturity))
col2 = (1 - np.exp(-lambda_val*maturity))/(lambda_val*maturity)
col3 = col2 - np.exp(-lambda_val*maturity)
factor = np.array([col1,col2,col3]).transpose()
return factor
def DNS_Kalman_filter(Params, *args):
N = Paramcheck(Params)
if N == 0:
Kappa = np.array([[params[20],0,0], [0,params[21],0], [0,0,params[22]]])
Sigma = np.array([[params[1],0,0], [params[2],params[3],0],
[params[4],params[5],params[6]]])
State = np.array([params[7], params[8], params[9]])
Lambda = params[0]
SigmaEps = np.identity(10)
for i in range(10):
SigmaEps[i][i] = params[i+10]
Obs_Yield = args[0]
Obs_Date = args[1]
Timegap = args[2]
Obs_Mty = args[3]
Finalstate = args[4]
Mty_length = len(Obs_Mty)
B = NS_factor(lambda_val = Lambda,maturity = Obs_Mty)
H_large = SigmaEps **2
N_obs = len(Obs_Date)
LLH_vec = np.zeros(N_obs)
phi1 = linalg.expm(-Kappa*Timegap)
phi0 = (np.identity(3)-phi1) # State
Eigenvalues = np.linalg.eig(Kappa)[0]
Eigen_vec = np.linalg.eig(Kappa)[1]
Eigen_vec_inv = np.linalg.inv(Eigen_vec)
S = Eigen_vec_inv # Sigma # Sigma.transpose() # Eigen_vec_inv.transpose()
Atilde = np.dot(Sigma[0], Sigma[0])
Btilde = np.dot(Sigma[1], Sigma[1])
Ctilde = np.dot(Sigma[2], Sigma[2])
Dtilde = np.dot(Sigma[0], Sigma[1])
Etilde = np.dot(Sigma[0], Sigma[2])
Ftilde = np.dot(Sigma[1], Sigma[2])
res1= Atilde* Obs_Mty* Obs_Mty/6
res2= Btilde*(1/(2*Lambda**2) - (1-np.exp(-Lambda*Obs_Mty))/(Lambda**3*Obs_Mty) + (1-
np.exp(-2*Lambda*Obs_Mty))/(4*Lambda**3*Obs_Mty))
res3= Ctilde*(1/(2*Lambda**2) + np.exp(-Lambda*Obs_Mty)/(Lambda**2)-
Obs_Mty*np.exp(-2*Lambda*Obs_Mty)/(4*Lambda) -
3*np.exp(-2*Lambda*Obs_Mty)/(4*Lambda**2) - 2*(1-np.exp(-
Lambda*Obs_Mty))/(Lambda**3*Obs_Mty) + 5*(1-
np.exp(-2*Lambda*Obs_Mty))/(8*Lambda**3*Obs_Mty))
res4= Dtilde*(Obs_Mty/(2*Lambda) + np.exp(-Lambda*Obs_Mty)/(Lambda**2) - (1-np.exp(-
Lambda*Obs_Mty))/(Lambda**3*Obs_Mty))
res5= Etilde*(3*np.exp(-Lambda*Obs_Mty)/(Lambda**2) + Obs_Mty/(2*Lambda)+Obs_Mty*np.exp(-
Lambda*Obs_Mty)/(Lambda) - 3*(1-np.exp(-Lambda*Obs_Mty))/(Lambda**3*Obs_Mty))
res6= Ftilde*(1/(Lambda**2) + np.exp(-Lambda*Obs_Mty)/(Lambda**2) -
np.exp(-2*Lambda*Obs_Mty)/(2*Lambda**2) - 3*(1-np.exp(-
Lambda*Obs_Mty))/(Lambda**3*Obs_Mty) + 3*(1-
np.exp(-2*Lambda*Obs_Mty))/(4*Lambda**3*Obs_Mty))
val = res1 + res2 + res3 + res4 + res5 + res6
V_mat = np.zeros([3,3])
V_lim = np.zeros([3,3])
for i in range(3):
for j in range(3):
V_mat[i][j] = S[i][j]*(1-np.exp(-(Eigenvalues[i] +
Eigenvalues[j])*Timegap))/(Eigenvalues[i] + Eigenvalues[j])
V_lim[i][j] = S[i][j]/(Eigenvalues[i] + Eigenvalues[j])
Q = (Eigen_vec # V_mat # Eigen_vec.transpose()).real
Sigma_lim = (Eigen_vec # V_lim # Eigen_vec.transpose()).real
for i in range(N_obs):
y = Obs_Yield[i]
xhat = phi0 + phi1 # State
y_implied = B # xhat
v = y - y_implied + val
Sigmahat = phi1 # Sigma_lim # phi1.transpose() + Q
F = B # Sigmahat # B.transpose() + H_large
detF = np.linalg.det(F)
if CheckDet(detF) > 0:
N = 3
break
Finv = np.linalg.inv(F)
State = xhat + Sigmahat # B.transpose() # Finv # v
Sigma_lim = Sigmahat - Sigmahat # B.transpose() # Finv # B # Sigmahat
LLH_vec[i] = np.log(detF) + v.transpose() # Finv # v
if N == 0:
if Finalstate:
yDate = Obs_Date[-1]
result = np.array([yDate,State])
else:
result = 0.5 * (sum(LLH_vec) + Mty_length*N_obs*np.log(2*np.pi))
else:
result = 7000000
return result
I made a code that does Arbitrage Free Nelson-Siegel model. Data is return rates of bond (1Y,1.5Y, ... ,20Y). I wanna optimize that function with scipy optimize.minimize function with fixed *args.
Suppose that Initial parmas are verified that it's close to optimized params from empirical experiments using Dynamic Nelson-Siegel Model.
LLC_new = 0
while True:
LLC_old = LLC_new
OPT = optimize.minimize(x0=params,fun=DNS_Kalman_filter, args=
(data.values,data.index,timegap,maturity,0))
params = OPT.x
LLC_new = round(OPT.fun,5)
print("Current LLC: %0.5f" %LLC_new)
if LLC_old == LLC_new:
OPT_para = params
FinalState = DNS_Kalman_filter(params,data.values,data.index,timegap,maturity,True)
break
Result is
Current LLC: -7613.70146
Current LLC: -7613.70146
LLC(log-likelihood value) isn't maximized. It's not a result I desire using Optimizer.
Is there any solution for that?
In R, there is optim() function works as similar as scipy.optimize.minimize() which works really well. I also have a R code for that very similar to this Python code.

Error: TypeError: cannot perform reduce with flexible type

i am using python version 3.7.Below is the code in which I am performing operation along the rows. i want the mean of the data which are along the rows but I get an error. i am new to numpy and python. i am reading the data from text file.
My code is:
import numpy as np
def getIndexFromDatetime(date_from, date_to):
'''date_from = [2, 10] : 10oclock of day2
'''
if date_from[1] > 24 or date_to[1] > 24: print('error')
start = (date_from[0] - 1) * 48 + date_from[1] * 2
end = (date_to[0] - 1) * 48 + date_to[1] * 2
return [start, end]
def is_num(s):
return s.replace(',', '').replace('.', '').replace('-', '').isnumeric()
def get_dataset(fpath):
with open(fpath, 'r') as f:
cnt = 0
DataWeather = {}
header = []
dtime = []
temp1 = []
temp2 = []
for line in f:
terms = line.split('\t')
#print(terms)
if cnt == 0: header1 = terms
if cnt == 1: header2 = terms
#header.append(terms[3])
cnt += 1
if cnt == 2:
for i in range(len(header1)):
header.append(header1[i]+header2[i])
#print(header)
for i in range(len(terms)):
DataWeather[header[i]] = []
#break
if cnt > 2:
for i in range(len(terms)):
if is_num(terms[i]):
DataWeather[header[i]].append(float(terms[i]))
else:
DataWeather[header[i]].append(terms[i])
for i in range(len(DataWeather[header[0]])):
dtime.append(DataWeather[header[0]][i]+' '+DataWeather[header[1]][i])
return DataWeather, header
def get_data(dataset, header, idx):
y = dataset[header][idx[0]:idx[1]]
return y
data_dir = 'weather_data'
month = 3
day = list(range(1,10))
header_idx = [2,3,4,5,7,16]
for d in day:
print(d)
dtime_from = [d, 9]
dtime_to = [d, 18]
dtime_idx = getIndexFromDatetime(dtime_from, dtime_to)
fpath = '{0}/2019-{1:02}.txt'.format(data_dir, month)
dataset, header = get_dataset(fpath)
for h in header_idx:
print(fpath)
print(header[h], dtime_from, dtime_to, dtime_idx)
data = get_data(dataset, header[h], dtime_idx)
#data= list(map(float,np.array(data)))
#data = map(np.array(data, dtype=np.float))
print(data)
print(np.mean(data))
i am getting the following error:
ret = umr_sum(arr, axis, dtype, out, keepdims)
TypeError: cannot perform reduce with flexible type
i also tried some functions like "map" and "list" as commented in the code still it gives error of converting string to float.

str does not support the buffer interface using .find

trying to search within a .thumbdata3 file for thumbnail images. This was someone else's sample code, but I am getting an error
"str does not support the buffer interface using .find"
"""extract files from Android thumbdata3 file"""
f=open('thumbdata3.dat','rb')
tdata = f.read()
f.close()
ss = '\xff\xd8'
se = '\xff\xd9'
count = 0
start = 0
while True:
x1 = tdata.find(ss,start)
if x1 < 0:
break
x2 = tdata.find(se,x1)
jpg = tdata[x1:x2+1]
count += 1
fname = 'extracted%d03.jpg' % (count)
fw = open(fname,'wb')
fw.write(jpg)
fw.close()
start = x2+2
ok, turned out to be very simple.
just add b in front of the data I am trying to match
so
ss = '\xff\xd8'
se = '\xff\xd9'
becomes
ss = b'\xff\xd8'
se = b'\xff\xd9'
It's all right.
With Python 3.x like python-3.6.2
Rename .thumbdata3-1763508120 file to thumbdata3.dat
Rename .thumbdata3--1967290299 file to thumbdata4.dat
enter code here
"""extract files from Android thumbdata3 file"""
f=open('thumbdata3.dat','rb')
tdata = f.read()
f.close()
ss = b'\xff\xd8'
se = b'\xff\xd9'
count = 0
start = 0
while True:
x1 = tdata.find(ss,start)
if x1 < 0:
break
x2 = tdata.find(se,x1)
jpg = tdata[x1:x2+1]
count += 1
fname = 'extracted%d03.jpg' % (count)
fw = open(fname,'wb')
fw.write(jpg)
fw.close()
start = x2+2
enter code here
"""extract files from Android thumbdata4 file"""
f=open('thumbdata4.dat','rb')
tdata = f.read()
f.close()
ss = b'\xff\xd8'
se = b'\xff\xd9'
count = 0
start = 0
while True:
x1 = tdata.find(ss,start)
if x1 < 0:
break
x2 = tdata.find(se,x1)
jpg = tdata[x1:x2+1]
count += 1
fname = 'extracted%d04.jpg' % (count)
fw = open(fname,'wb')
fw.write(jpg)
fw.close()
start = x2+2

Resources