pyspark extracting specific value to variable - apache-spark

I have the below script.
I am a bit stuck with this specific piece:
datex = datetime.datetime.strptime(df1.start_time,'%Y-%m-%d %H:%M:%S')
I can't figure out how to extract the actual value from the start_time field & store it in the datex variable.
Can anyone help me please?
while iters <10:
time_to_add = iters * 900
time_to_checkx = time_to_check + datetime.timedelta(seconds=time_to_add)
iters = iters + 1
session = 0
for row in df1.rdd.collect():
datex = datetime.datetime.strptime(df1.start_time,'%Y-%m-%d %H:%M:%S')
print(datex)
filterx = df1.filter(datex < time_to_checkx)
session = session + filterx.count()
print('current session value' + str(session))
print(session)

Check this out. I have converted your for loop in general. If you can get me more info on iters variable or the explanation of how you want it to work:
import pyspark.sql.functions a F
spark_date_format = "YYYY-MM-dd hh:mm:ss"
session = 0
time_to_checkx = time_to_check + datetime.timedelta(seconds=time_to_add)
df1 = df1.withColumn('start_time', F.to_timestamp(F.col(date_column), spark_date_format))
filterx = df1.filter(df1.start_time < time_to_checkx)
session = session + filterx.count()

Related

Print REGEX using USER DEFINED FUNCTION

I'm trying to print the variables ccb_3, nome, data, taxa and parcela using the function I defined as "ext_ccb", but when I run the code it returns 3 times (because I defined q as 3) the variable ccb_3.
I tried splitting it into 2 functions (one with the variable ccb_3 e one with the rest that uses REGEX) but it didn't worked to.
'''
from PyPDF2 import PdfFileReader, PdfFileWriter
import re
x = 1
q = 3
def ext_ccb():
nome_ccb = str("Vazio (" + y + ").pdf")
ccb = PdfFileReader(nome_ccb)
ccb_obj_1 = ccb.getPage(0)
ccb_text_1 = ccb_obj_1.extractText()
ccb_obj_2 = ccb.getPage(1)
ccb_text_2 = ccb_obj_2.extractText()
ccb_3 = ccb_text_1[1:8]
print(ccb_3)
pattern_nome = re.compile(r'''[^\n][^CPF][A-Z](|\.)\w*\s*.*$
Nome Completo
''', re.M)
matches_nome = pattern_nome.finditer(ccb_text_1)
for match in matches_nome:
nome = str(match)
nome = nome[40:].replace(r"\n\nNome Completo\n'>", "")
print(nome)
pattern_data = re.compile(r'''5\.2\. Modalidade
\d{2}/\d{2}/\d{4}
''')
matches_data = pattern_data.findall(ccb_text_1)
for match in matches_data:
data = match[17:27]
print(data)
pattern_taxa = re.compile(r'''Taxa de Juros a\.m\. \(%\)
\d*,\d*''')
matches_taxa = pattern_taxa.findall(ccb_text_2)
for match in matches_taxa:
taxa = match[24:]
print(taxa)
pattern_vparcela = re.compile(r'''Valor das Parcelas
R\$ \d*,\d*''')
matches_vparcela = pattern_vparcela.findall(ccb_text_2)
for match in matches_vparcela:
parcela = match[23:]
print(parcela)
while x <= q:
y = str(x)
x += 1
ext_ccb()
'''
What I really need is to insert it into an csv, multiple times from different PDF's, which I already have the code for:
'''
from csv import writer
x = 5
q = 0
while q < x:
q += 1
ccb_3 += 1
nome += 2
data += 4
taxa += 4
parcela += 5
list_data = [ccb_3, nome, data, taxa, parcela]
with open('csv_teste.csv', 'a', newline = '') as f_object:
writer_object = writer(f_object)
writer_object.writerow(list_data)
f_object.close()
'''
How can I save each data from each PDF and put it into the CSV?

Target Labeling Using Sliding Window On Stock Data In Python

I'm trying to label BUY, SELL, and HOLD values to the closing stock prices based on the algorithm I found in a paper. I'm not quite able to figure out the error I'm getting. I'd very much appreciate your help. Thank you.
Algorigthm:
[EDITED]
My implementation:
window_size = 11
counter = 0
result = []
window_begin_idx=0; window_end_idx=0; window_middle_idx=0; min_idx=0; max_idx=0;
while counter < len(closing_price):
if counter > window_size:
window_begin_idx = counter - window_size
window_end_idx = window_begin_idx + window_size - 1
window_middle_idx = (window_begin_idx + window_end_idx)//2
for i in range(window_begin_idx, window_end_idx+1):
rng = closing_price[window_begin_idx:window_end_idx+1]
number = closing_price[i]
mins = rng.min()
maxs = rng.max()
if number < mins:
mins=number
min_idx = np.argmin(rng)
if number > maxs:
maxs=number
max_idx = np.argmax(rng)
if max_idx == window_middle_idx:
result.append("SELL")
elif min_idx == window_middle_idx:
result.append("BUY")
else:
result.append("HOLD")
mins = 0.0
maxs = 10000.0
counter+=1
After the edit based on the author's JAVA code, I'm only getting the HOLD label. The author's implementation is here.
You need to initialize mins, maxs, min_idx and max_idx with appropriate values before the main loop.
In your case if max_idx == occurs earlier than any max_idx assignment
Edit after questing change:
Seems in Python you can make similar behavior replacing the whole for-loop with:
rng = closing_price[window_begin_idx:window_end_idx+1]
mins = rng.min()
maxs = rng.max()
min_idx = rng.index(mins)
max_idx = rng.index(maxs)
After reading through the author's implementation and following the suggestions provided by MBo, I have managed to solve this issue. So, now anyone who wants this algorithm in python, below is the code:
window_size = 11
counter = 0
result = []
window_begin_idx=0; window_end_idx=0; window_middle_idx=0; min_idx=0; max_idx=0;
number=0.0; mins=10000.0; maxs=0.0
while counter < len(closing_price):
if counter > window_size:
window_begin_idx = counter - window_size
window_end_idx = window_begin_idx + window_size - 1
window_middle_idx = (window_begin_idx + window_end_idx)//2
for i in range(window_begin_idx, window_end_idx+1):
number = closing_price[i]
if number < mins:
mins=number
min_idx = np.where(closing_price==mins)[0][0]
if number > maxs:
maxs=number
max_idx = np.where(closing_price==maxs)[0][0]
if max_idx == window_middle_idx:
result.append("SELL")
elif min_idx == window_middle_idx:
result.append("BUY")
else:
result.append("HOLD")
mins = 10000.0
maxs = 0.0
counter+=1

Track database history

def startlog():
id = enteruser.id
x = time.localtime()
sec = x.tm_sec
min = x.tm_min
hour = x.tm_hour + 1
day = x.tm_mday
date = f"{x.tm_mon}-{x.tm_mday}-{x.tm_year}"
starttime = (day * 86400) + (hour * 3600) + (min * 60) + sec
updatestart = "UPDATE log SET start = ?, date = ? WHERE ID = ?"
c.execute(updatestart, (starttime, date, id,))
conn.commit()
I have this function startlog, and a clone of it endlog.
My database log is consisted of (name, starttime, endtime, date)
Is there any way to keep track of the changes?
Desired output:
Name / Time / Date
x / time1 / date1
x / time2 / date2
I tried creating a list so everytime I'm calling out the function it will append on the list but it disappears after the session.
I used csv for my case since it's just a personal project. I used columns like ID/Time in / Time out / Total Time and used ID to determine which value to display. This is the snippet of my code (using tkinter for gui)
def csvwrite():
with open ('test.cvs', 'a', newline="") as csvfile:
writer = csv.writer(csvfile)
tup1 = (enteruser.id, log.start, log.end)
writer.writerow(tup1)
csvfile.close()
def csvread():
with open('test.cvs', 'r') as csvfile:
reader = csv.reader(csvfile)
filtered = filter(filterer, reader)
res = []
for i in filtered:
print(i)
historylbl = Label(historyWindow.historywndw, text = i)
historylbl.pack()

How to get different predation values every year for my simulation?

I am trying to run this model of seed predation and population dynamics but I am new to coding and I am only getting one predation value that gets repeated over different generations. How can I get different predation values for different year?
Also, Is there an issue with the normalizing method used?
import numpy as np
import matplotlib.pyplot as plt
def is_odd(year):
return ((year % 2) == 1)
def reproduction(p_iter, year, dead):
if is_odd(year):
predation = dead
seedsProd = p_iter*s_oddd
seedsPred = K*predation*200*(seedsProd/np.sum(seedsProd))
return (seedsProd - seedsPred) + np.array([0,0,p_iter[2]])
else:
predation = dead
seedsProd = p_iter*s_even
seedsPred = K*predation*200*(seedsProd/np.sum(seedsProd))
return (seedsProd - seedsPred) +np.array([0,p_iter[1],0])
def normalize(p_iter):
if is_odd(year):
x = np.copy(p_iter)
x[2] = 0
x = (K-p_iter[2]) * x / sum(x)
x[2] = p_iter[2]
return x
else:
x = np.copy(p_iter)
x[1] = 0
x = (K-p_iter[1]) * x / sum(x)
x[1] = p_iter[1]
return x
Predation is defined here:
def predation():
return (np.array(np.round(np.random.uniform(0.4,0.6),2)))
#max_years
Y = 100
#carrying capacity
K = 1000
#initial populaton
p_1, p_2, p_3 = 998., 1., 1.
#seed released per plant
s_1, s_2, s_3 = 200, 260, 260
p_init = np.array([p_1, p_2, p_3],dtype=float)
s_oddd = np.array([s_1, s_2, 0.0])
s_even = np.array([s_1, 0.0, s_3])
n = len(p_init)
m = np.append(p_init,s_oddd)
p_iter = p_init
dead = 0
norm = 0
for year in range(1,Y+1):
dead = predation()
seeds = reproduction(p_iter, year, dead)
p_iter = np.maximum(seeds,np.zeros(p_iter.shape))
p_iter = normalize(p_iter)
m = np.vstack((m, [*p_iter]+[*seeds] ))

Calculating entropy in ID3 log2(0) in formula

import numpy as np
udacity_set = np.array(
[[1,1,1,0],
[1,0,1,0],
[0,1,0,1],
[1,0,0,1]])
label = udacity_set[:,udacity_set.shape[1]-1]
fx = label.size
positive = label[label == 1].shape[0]
positive_probability = positive/fx
negative = label[label == 0].shape[0]
negative_probability = negative/fx
entropy = -negative_probability*np.log2(negative_probability) - positive_probability*np.log2(positive_probability)
atribute = 0
V = 1
attribute_set = udacity_set[np.where(udacity_set[:,atribute] == 1)] #selecting positive instance of occurance in attribute 14
instances = attribute_set.shape[0]
negative_labels = attribute_set[np.where(attribute_set[:,attribute_set.shape[1]-1]== 0)].shape[0]
positive_labels = attribute_set[np.where(attribute_set[:,attribute_set.shape[1]-1]== 1)].shape[0]
p0 = negative_labels/instances
p1 = positive_labels/instances
entropy2 = - p0*np.log2(p0) - p1*np.log2(p1)
attribute_set2 = udacity_set[np.where(udacity_set[:,atribute] == 0)] #selecting positive instance of occurance in attribute 14
instances2 = attribute_set2.shape[0]
negative_labels2 = attribute_set[np.where(attribute_set2[:,attribute_set2.shape[1]-1]== 0)].shape[0]
positive_labels2 = attribute_set[np.where(attribute_set2[:,attribute_set2.shape[1]-1]== 1)].shape[0]
p02 = negative_labels2/instances2
p12 = positive_labels2/instances2
entropy22 = - p02*np.log2(p02) - p12*np.log2(p12)
Problem is when attribute is pure and entropy is meant to be 0. But when i put this into a formula i get NaN. I know how to code workaround, but why is this formula rigged?

Resources