Parallel processing in Python with input output file

Parallel processing in Python with input output file - python-3.x

import numpy as np
import pandas as pd
import math
j = 0
k = 0
time_array = []
average_pa = []
for i in range(3600):
time_array.append(j)
average_pa.append(k)
j += 0.1
time_array1 = tuple(time_array)
pa1 = 0.0
time1 = 0.0
chunk_size = 1000
for chunk in pd.read_csv("data_fiftydyne.txt", header=1, delimiter='\t',
chunksize=chunk_size,
skip_blank_lines=True, error_bad_lines=False, keep_default_na=False):
file2 = chunk.to_numpy()
for jj in range(file2.shape[0]):
time1 = file2[jj][9]
if type(time1) == str:
time1 = 362
pa1 = file2[jj][11]
ff = open("aver_pa_time_fiftydyne.csv", "a")
ff.truncate(0)
ff.seek(0)
for ii in range(3600):
if math.floor(10 * time_array1[ii]) == math.floor(float(10 * time1)):
average_pa[ii] += pas1
print(pa1)
print(time1)
ff.write(str(average_pa[ii]) + ',' + str(time_array1[ii]) + '\n')
ff.close()
This is code works in serial. How to parallelize this code? The size of the file is 67GB. It takes 5 days to run in series on CPU with 4.2GHz and 8GB RAM.

Related

Create a Multiprocess script from a system consisting of multiple files

Hi guys im not too into python but need to do some research. The problem mainly consists of a file that calculates a large number of non linear equations which takes quite some time. The idea is to implement Multiprocessing in some way. I was wondering if there is a "correct" way to do this, since the main file calls the "computational" script, should i focus on the main or the computational file for multiprocessing? There are more files involved but this should be a start.
Main file:
import numpy as np
import pandas as pd
from properties_basic import *
import sys
sys.path.append('../physics_sup/')
import os, glob, shutil
for filename in glob.glob("obl_point_data_*"):
os.remove(filename)
for filename in glob.glob("restart.pkl"):
os.remove(filename)
for filename in glob.glob("data.pvd"):
os.remove(filename)
for filename in glob.glob("__pycache__/conversio*"):
os.remove(filename)
for filename in glob.glob("__pycache__/mode*"):
os.remove(filename)
for filename in glob.glob("__pycache__/operato*"):
os.remove(filename)
for filename in glob.glob("__pycache__/physi*"):
os.remove(filename)
for filename in glob.glob("__pycache__/prope*"):
os.remove(filename)
from model_benchmark import Model
from darts.engines import value_vector, redirect_darts_output
import matplotlib.pyplot as plt
grid_1D = False
redirect_darts_output('run.log')
n = Model()
n.init()
injectionsrate = np.genfromtxt('injectionrate.txt')[0:].astype(float) #np.genfromtxt('InjectionMonthly.txt')[0:].astype(float)#
injectionsrate = injectionsrate / 20
#mu_w = CP.PropsSI('V', 'T', 22, 'P|liquid', bar2pa(130), 'Water') * 1000
#n.property_container.viscosity_ev = dict([('wat', ViscosityConst(mu_w))])
NT = 16 # 16
runtime = 50 # 365
#increase = np.repeat(0.000005,37)
#print(increase)
for i in range(NT):
n.inj_rate = injectionsrate[i]
n.injection_temperature = 273.15 + 22
n.set_boundary_conditions(injectionrate=injectionsrate[i], tempinj=273.15+22)
#n.property_container.kinetic_rate_ev = kinetic_advanced(comp_min=1e-11, rsa=int(2e-05 + increase[NT]))
n.run_python(runtime)
time_data = pd.DataFrame.from_dict(n.physics.engine.time_data)
time_data.to_pickle("darts_time_data.pkl")
writer = pd.ExcelWriter('time_data.xlsx')
time_data.to_excel(writer, 'Sheet1')
writer.save()
writer.close()
n.export_vtk()
n.save_restart_data()
n.load_restart_data()
injectionsrate2 = np.genfromtxt('injectionrate.txt')[15:].astype(float) #np.genfromtxt('InjectionMonthly.txt')[191:].astype(float)#
injectionsrate2 = injectionsrate2 / 20 #*2
#mu_w2 = CP.PropsSI('V', 'T', 10, 'P|liquid', bar2pa(130), 'Water') * 1000
#n.property_container.viscosity_ev = dict([('wat', ViscosityConst(1.3))])
n.property_container.kinetic_rate_ev = kinetic_advanced(comp_min=1e-11, rsa=2e-03)
days = 200
NT2 = 21 #21 # 252
runtime2 = 50 # 30
for i in range(NT2):
n.inj_rate = injectionsrate2[i]
n.injection_temperature = 273.15 + 10
n.set_boundary_conditions(injectionrate=injectionsrate2[i], tempinj=273.15 + 10)
n.run_python(runtime2)
time_data2 = pd.DataFrame.from_dict(n.physics.engine.time_data)
time_data2.to_pickle("darts_time_data2.pkl")
writer = pd.ExcelWriter('time_data2.xlsx')
time_data2.to_excel(writer, 'Sheet1')
writer.save()
writer.close()
n.export_vtk()
n.print_timers()
n.print_stat()
import darts.tools.plot_darts
from darts.tools.plot_darts import *
p_w = 'I1'
#ax = plot_water_rate_darts(p_w, time_data)
time_dataInjection = pd.read_pickle("darts_time_data.pkl")
time_dataInjection2= pd.read_pickle("darts_time_data2.pkl")
#ax = darts.tools.plot_darts.plot_water_rate_darts(p_w, time_dataInjection)
ax2 = darts.tools.plot_darts.plot_water_rate_darts(p_w, time_dataInjection2)
p_w2 = 'P1'
#ax3 = darts.tools.plot_darts.plot_water_rate_darts(p_w2, time_dataInjection)
ax4 = darts.tools.plot_darts.plot_water_rate_darts(p_w2, time_dataInjection2)
ax5 = darts.tools.plot_darts.plot_bhp_darts(p_w, time_dataInjection2)
plt.show()
The Non linear calculator:
from math import fabs
import pickle
import os
import numpy as np
from darts.engines import *
from darts.engines import print_build_info as engines_pbi
from darts.physics import print_build_info as physics_pbi
from darts.print_build_info import print_build_info as package_pbi
class DartsModel:
def __init__(self):
# print out build information
engines_pbi()
physics_pbi()
package_pbi()
self.timer = timer_node() # Create time_node object for time record
self.timer.start() # Start time record
self.timer.node["simulation"] = timer_node() # Create timer.node called "simulation" to record simulation time
self.timer.node["newton update"] = timer_node()
self.timer.node[
"initialization"] = timer_node() # Create timer.node called "initialization" to record initialization time
self.timer.node["initialization"].start() # Start recording "initialization" time
self.params = sim_params() # Create sim_params object to set simulation parameters
self.timer.node["initialization"].stop() # Stop recording "initialization" time
def init(self):
self.reservoir.init_wells()
self.physics.init_wells(self.reservoir.wells)
self.set_initial_conditions()
self.set_boundary_conditions()
self.set_op_list()
self.reset()
def reset(self)
self.physics.engine.init(self.reservoir.mesh, ms_well_vector(self.reservoir.wells),
op_vector(self.op_list),
self.params, self.timer.node["simulation"])
def set_initial_conditions(self):
pass
def set_boundary_conditions(self):
pass
def set_op_list(self):
self.op_list = [self.physics.acc_flux_itor]
def run(self, days=0):
if days:
runtime = days
else:
runtime = self.runtime
self.physics.engine.run(runtime)
def run_python(self, days=0, restart_dt=0, log_3d_body_path=0, timestep_python=False):
if days:
runtime = days
else:
runtime = self.runtime
mult_dt = self.params.mult_ts
max_dt = self.params.max_ts
self.e = self.physics.engine
t = self.e.t
if fabs(t) < 1e-15:
dt = self.params.first_ts
elif restart_dt > 0:
dt = restart_dt
else:
dt = self.params.max_ts
runtime += t
ts = 0
if log_3d_body_path and self.physics.n_vars == 3:
self.body_path_start()
while t < runtime:
if timestep_python:
converged = self.e.run_timestep(dt, t)
else:
converged = self.run_timestep_python(dt, t)
if converged:
t += dt
ts = ts + 1
print("# %d \tT = %3g\tDT = %2g\tNI = %d\tLI=%d"
% (ts, t, dt, self.e.n_newton_last_dt, self.e.n_linear_last_dt))
dt *= mult_dt
if dt > max_dt:
dt = max_dt
if t + dt > runtime:
dt = runtime - t
if log_3d_body_path and self.physics.n_vars == 3:
self.body_path_add_bodys(t)
nb_begin = self.reservoir.nx * self.reservoir.ny * (self.body_path_map_layer - 1) * 3
nb_end = self.reservoir.nx * self.reservoir.ny * (self.body_path_map_layer) * 3
self.save_matlab_map(self.body_path_axes[0] + '_ts_' + str(ts), self.e.X[nb_begin:nb_end:3])
self.save_matlab_map(self.body_path_axes[1] + '_ts_' + str(ts), self.e.X[nb_begin + 1:nb_end:3])
self.save_matlab_map(self.body_path_axes[2] + '_ts_' + str(ts), self.e.X[nb_begin + 2:nb_end:3])
else:
dt /= mult_dt
print("Cut timestep to %2.3f" % dt)
if dt < 1e-8:
break
self.e.t = runtime
print("TS = %d(%d), NI = %d(%d), LI = %d(%d)" % (self.e.stat.n_timesteps_total, self.e.stat.n_timesteps_wasted,
self.e.stat.n_newton_total, self.e.stat.n_newton_wasted,
self.e.stat.n_linear_total, self.e.stat.n_linear_wasted))
def load_restart_data(self, filename='restart.pkl'):
if os.path.exists(filename):
with open(filename, "rb") as fp:
data = pickle.load(fp)
days, X, arr_n = data
self.physics.engine.t = days
self.physics.engine.X = value_vector(X)
self.physics.engine.Xn = value_vector(X)
self.physics.engine.op_vals_arr_n = value_vector(arr_n)
def save_restart_data(self, filename='restart.pkl'):
"""
Function to save the simulation data for restart usage.
:param filename: Name of the file where restart_data stores.
"""
t = np.copy(self.physics.engine.t)
X = np.copy(self.physics.engine.X)
arr_n = np.copy(self.physics.engine.op_vals_arr_n)
data = [t, X, arr_n]
with open(filename, "wb") as fp:
pickle.dump(data, fp, 4)
def check_performance(self, overwrite=0, diff_norm_normalized_tol=1e-10, diff_abs_max_normalized_tol=1e-7,
rel_diff_tol=1, perf_file=''):
fail = 0
data_et = self.load_performance_data(perf_file)
if data_et and not overwrite:
data = self.get_performance_data()
nb = self.reservoir.mesh.n_res_blocks
nv = self.physics.n_vars
for v in range(nv):
sol_et = data_et['solution'][v:nb * nv:nv]
diff = data['solution'][v:nb * nv:nv] - sol_et
sol_range = np.max(sol_et) - np.min(sol_et)
diff_abs = np.abs(diff)
diff_norm = np.linalg.norm(diff)
diff_norm_normalized = diff_norm / len(sol_et) / sol_range
diff_abs_max_normalized = np.max(diff_abs) / sol_range
if diff_norm_normalized > diff_norm_normalized_tol or diff_abs_max_normalized > diff_abs_max_normalized_tol:
fail += 1
print(
'#%d solution check failed for variable %s (range %f): L2(diff)/len(diff)/range = %.2E (tol %.2E), max(abs(diff))/range %.2E (tol %.2E), max(abs(diff)) = %.2E' \
% (fail, self.physics.vars[v], sol_range, diff_norm_normalized, diff_norm_normalized_tol,
diff_abs_max_normalized, diff_abs_max_normalized_tol, np.max(diff_abs)))
for key, value in sorted(data.items()):
if key == 'solution' or type(value) != int:
continue
reference = data_et[key]
if reference == 0:
if value != 0:
print('#%d parameter %s is %d (was 0)' % (fail, key, value))
fail += 1
else:
rel_diff = (value - data_et[key]) / reference * 100
if abs(rel_diff) > rel_diff_tol:
print('#%d parameter %s is %d (was %d, %+.2f%%)' % (fail, key, value, reference, rel_diff))
fail += 1
if not fail:
print('OK, \t%.2f s' % self.timer.node['simulation'].get_timer())
return 0
else:
print('FAIL, \t%.2f s' % self.timer.node['simulation'].get_timer())
return 1
else:
self.save_performance_data(perf_file)
print('SAVED')
return 0
def get_performance_data(self):
perf_data = dict()
perf_data['solution'] = np.copy(self.physics.engine.X)
perf_data['reservoir blocks'] = self.reservoir.mesh.n_res_blocks
perf_data['variables'] = self.physics.n_vars
perf_data['OBL resolution'] = self.physics.n_points
perf_data['operators'] = self.physics.n_ops
perf_data['timesteps'] = self.physics.engine.stat.n_timesteps_total
perf_data['wasted timesteps'] = self.physics.engine.stat.n_timesteps_wasted
perf_data['newton iterations'] = self.physics.engine.stat.n_newton_total
perf_data['wasted newton iterations'] = self.physics.engine.stat.n_newton_wasted
perf_data['linear iterations'] = self.physics.engine.stat.n_linear_total
perf_data['wasted linear iterations'] = self.physics.engine.stat.n_linear_wasted
sim = self.timer.node['simulation']
jac = sim.node['jacobian assembly']
perf_data['simulation time'] = sim.get_timer()
perf_data['linearization time'] = jac.get_timer()
perf_data['linear solver time'] = sim.node['linear solver solve'].get_timer() + sim.node[
'linear solver setup'].get_timer()
interp = jac.node['interpolation']
perf_data['interpolation incl. generation time'] = interp.get_timer()
return perf_data
def save_performance_data(self, file_name=''):
import platform
if file_name == '':
file_name = 'perf_' + platform.system().lower()[:3] + '.pkl'
data = self.get_performance_data()
with open(file_name, "wb") as fp:
pickle.dump(data, fp, 4)
#staticmethod
def load_performance_data(file_name=''):
import platform
if file_name == '':
file_name = 'perf_' + platform.system().lower()[:3] + '.pkl'
if os.path.exists(file_name):
with open(file_name, "rb") as fp:
return pickle.load(fp)
return 0
def print_timers(self):
print(self.timer.print("", ""))
def print_stat(self):
self.physics.engine.print_stat()
def plot_layer_map(self, map_data, k, name, transpose=0):
import plotly
import plotly.graph_objs as go
nxny = self.reservoir.nx * self.reservoir.ny
layer_indexes = np.arange(nxny * (k - 1), nxny * k)
layer_data = np.zeros(nxny)
# for correct vizualization of inactive cells
layer_data.fill(np.nan)
active_mask = np.where(self.reservoir.discretizer.global_to_local[layer_indexes] > -1)
layer_data[active_mask] = map_data[self.reservoir.discretizer.global_to_local[layer_indexes][active_mask]]
layer_data = layer_data.reshape(self.reservoir.ny, self.reservoir.nx)
if transpose:
layer_data = layer_data.transpose()
y_axis = dict(scaleratio=1, scaleanchor='x', title='X, block')
x_axis = dict(title='Y, block')
else:
x_axis = dict(scaleratio=1, scaleanchor='x', title='X, block')
y_axis = dict(title='Y, block')
data = [go.Heatmap(
z=layer_data)]
layout = go.Layout(title='%s, layer %d' % (name, k),
xaxis=x_axis,
yaxis=y_axis)
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='%s_%d_map.html' % (name, k))
def plot_layer_map_offline(self, map_data, k, name, transpose=0):
import plotly
plotly.offline.init_notebook_mode()
self.plot_layer_map(map_data, k, name, transpose)
def plot_layer_surface(self, map_data, k, name, transpose=0):
import plotly
import plotly.graph_objs as go
nxny = self.reservoir.nx * self.reservoir.ny
layer_indexes = np.arange(nxny * (k - 1), nxny * k)
layer_data = np.zeros(nxny)
# for correct vizualization of inactive cells
layer_data.fill(np.nan)
active_mask = np.where(self.reservoir.discretizer.global_to_local[layer_indexes] > -1)
layer_data[active_mask] = map_data[self.reservoir.discretizer.global_to_local[layer_indexes][active_mask]]
layer_data = layer_data.reshape(self.reservoir.ny, self.reservoir.nx)
if transpose:
layer_data = layer_data.transpose()
data = [go.Surface(z=layer_data)]
plotly.offline.plot(data, filename='%s_%d_surf.html' % (name, k))
def plot_geothermal_temp_layer_map(self, X, k, name, transpose=0):
import plotly
import plotly.graph_objs as go
import numpy as np
from darts.models.physics.iapws.iapws_property import iapws_temperature_evaluator
nxny = self.reservoir.nx * self.reservoir.ny
temperature = iapws_temperature_evaluator()
layer_pres_data = np.zeros(nxny)
layer_enth_data = np.zeros(nxny)
layer_indexes = np.arange(nxny * (k - 1), nxny * k)
active_mask = np.where(self.reservoir.discretizer.global_to_local[layer_indexes] > -1)
layer_pres_data[active_mask] = X[2 * self.reservoir.discretizer.global_to_local[layer_indexes][active_mask]]
layer_enth_data[active_mask] = X[2 * self.reservoir.discretizer.global_to_local[layer_indexes][active_mask] + 1]
# used_data = map_data[2 * nxny * (k-1): 2 * nxny * k]
T = np.zeros(nxny)
T.fill(np.nan)
for i in range(0, nxny):
if self.reservoir.discretizer.global_to_local[nxny * (k - 1) + i] > -1:
T[i] = temperature.evaluate([layer_pres_data[i], layer_enth_data[i]])
layer_data = T.reshape(self.reservoir.ny, self.reservoir.nx)
if transpose:
layer_data = layer_data.transpose()
y_axis = dict(scaleratio=1, scaleanchor='x', title='X, block')
x_axis = dict(title='Y, block')
else:
x_axis = dict(scaleratio=1, scaleanchor='x', title='X, block')
y_axis = dict(title='Y, block')
data = [go.Heatmap(
z=layer_data)]
layout = go.Layout(title='%s, layer %d' % (name, k),
xaxis=x_axis,
yaxis=y_axis)
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='%s_%d_map.html' % (name, k))
def plot_1d(self, map_data, name):
import plotly
import plotly.graph_objs as go
import numpy as np
nx = self.reservoir.nx
data = [go.Scatter(x=np.linspace(0, 1, nx), y=map_data[1:nx])]
plotly.offline.plot(data, filename='%s_surf.html' % name)
def plot_1d_all(self, map_data):
import plotly
import plotly.graph_objs as go
import numpy as np
nx = self.reservoir.nx
nc = self.physics.n_components
data = []
for i in range(nc - 1):
data.append(go.Scatter(x=np.linspace(0, 1, nx), y=map_data[i + 1::nc][1:nx], dash='dash'))
plotly.offline.plot(data, filename='Compositions.html')
def plot_cumulative_totals_mass(self):
import plotly.offline as po
import plotly.graph_objs as go
import numpy as np
import pandas as pd
nc = self.physics.n_components
darts_df = pd.DataFrame(self.physics.engine.time_data)
total_df = pd.DataFrame()
total_df['time'] = darts_df['time']
time_diff = darts_df['time'].diff()
time_diff[0] = darts_df['time'][0]
for c in range(nc):
total_df['Total injection c %d' % c] = 0
total_df['Total production c %d' % c] = 0
search_str = ' : c %d rate (Kmol/day)' % c
for col in darts_df.columns:
if search_str in col:
inj_mass = darts_df[col] * time_diff
prod_mass = darts_df[col] * time_diff
# assuming that any well can inject and produce over the whole time
inj_mass[inj_mass < 0] = 0
prod_mass[prod_mass > 0] = 0
total_df['Total injection c %d' % c] += inj_mass
total_df['Total production c %d' % c] -= prod_mass
data = []
for c in range(nc):
data.append(go.Scatter(x=total_df['time'], y=total_df['Total injection c %d' % c].cumsum(),
name='%s injection' % self.physics.components[c]))
data.append(go.Scatter(x=total_df['time'], y=total_df['Total production c %d' % c].cumsum(),
name='%s production' % self.physics.components[c]))
layout = go.Layout(title='Cumulative total masses (kmol)', xaxis=dict(title='Time (days)'),
yaxis=dict(title='Mass (kmols)'))
fig = go.Figure(data=data, layout=layout)
po.plot(fig, filename='Cumulative_totals_mass.html')
def plot_mass_balance_error(self):
import plotly.offline as po
import plotly.graph_objs as go
import numpy as np
import pandas as pd
nc = self.physics.n_components
darts_df = pd.DataFrame(self.physics.engine.time_data)
total_df = pd.DataFrame()
total_df['time'] = darts_df['time']
time_diff = darts_df['time'].diff()
time_diff[0] = darts_df['time'][0]
for c in range(nc):
total_df['Total source-sink c %d' % c] = 0
search_str = ' : c %d rate (Kmol/day)' % c
for col in darts_df.columns:
if search_str in col:
mass = darts_df[col] * time_diff
total_df['Total source-sink c %d' % c] += mass
data = []
for c in range(nc):
total_df['Total mass balance error c %d' % c] = darts_df['FIPS c %d (kmol)' % c] - total_df[
'Total source-sink c %d' % c].cumsum()
total_df['Total mass balance error c %d' % c] -= darts_df['FIPS c %d (kmol)' % c][0] - \
total_df['Total source-sink c %d' % c][0]
data.append(go.Scatter(x=total_df['time'], y=total_df['Total mass balance error c %d' % c],
name='%s' % self.physics.components[c]))
layout = go.Layout(title='Mass balance error (kmol)', xaxis=dict(title='Time (days)'),
yaxis=dict(title='Mass (kmols)'))
fig = go.Figure(data=data, layout=layout)
po.plot(fig, filename='Mass_balance_error.html')
def plot_FIPS(self):
import plotly.offline as po
import plotly.graph_objs as go
import numpy as np
import pandas as pd
nc = self.physics.n_components
darts_df = pd.DataFrame(self.physics.engine.time_data)
data = []
for c in range(nc):
data.append(go.Scatter(x=darts_df['time'], y=darts_df['FIPS c %d (kmol)' % c],
name='%s' % self.physics.components[c]))
layout = go.Layout(title='FIPS (kmol)', xaxis=dict(title='Time (days)'),
yaxis=dict(title='Mass (kmols)'))
fig = go.Figure(data=data, layout=layout)
po.plot(fig, filename='FIPS.html')
def plot_totals_mass(self):
import plotly.offline as po
import plotly.graph_objs as go
import numpy as np
import pandas as pd
nc = self.physics.n_components
darts_df = pd.DataFrame(self.physics.engine.time_data)
total_df = pd.DataFrame()
total_df['time'] = darts_df['time']
for c in range(nc):
total_df['Total injection c %d' % c] = 0
total_df['Total production c %d' % c] = 0
search_str = ' : c %d rate (Kmol/day)' % c
for col in darts_df.columns:
if search_str in col:
inj_mass = darts_df[col].copy()
prod_mass = darts_df[col].copy()
# assuming that any well can inject and produce over the whole time
inj_mass[inj_mass < 0] = 0
prod_mass[prod_mass > 0] = 0
total_df['Total injection c %d' % c] += inj_mass
total_df['Total production c %d' % c] -= prod_mass
data = []
for c in range(nc):
data.append(go.Scatter(x=total_df['time'], y=total_df['Total injection c %d' % c],
name='%s injection' % self.physics.components[c]))
data.append(go.Scatter(x=total_df['time'], y=total_df['Total production c %d' % c],
name='%s production' % self.physics.components[c]))
layout = go.Layout(title='Total mass rates (kmols/day)', xaxis=dict(title='Time (days)'),
yaxis=dict(title='Rate (kmols/day)'))
fig = go.Figure(data=data, layout=layout)
po.plot(fig, filename='Totals_mass_rates.html')
def plot_1d_compare(self, map_data1, map_data2):
import plotly
import plotly.graph_objs as go
import numpy as np
nx = self.reservoir.nx
nc = self.physics.n_components
data = []
for i in range(nc - 1):
data.append(go.Scatter(x=np.linspace(0, 1, nx), y=map_data1[i + 1::nc][1:nx],
name="Comp = %d, dt = 5 days" % (i + 1)))
for i in range(nc - 1):
data.append(go.Scatter(x=np.linspace(0, 1, nx), y=map_data2[i + 1::nc][1:nx],
name="Comp = %d, dt = 50 days" % (i + 1), line=dict(dash='dot')))
plotly.offline.plot(data, filename='Compositions.html')
def body_path_start(self):
with open('body_path.txt', "w") as fp:
itor = self.physics.acc_flux_itor
self.processed_body_idxs = set()
for i, p in enumerate(itor.axis_points):
fp.write('%d %lf %lf %s\n' % (p, itor.axis_min[i], itor.axis_max[i], self.body_path_axes[i]))
fp.write('Body Index Data\n')
def body_path_add_bodys(self, time):
with open('body_path.txt', "a") as fp:
fp.write('T=%lf\n' % time)
itor = self.physics.acc_flux_itor
all_idxs = set(itor.body_data.keys())
new_idxs = all_idxs - self.processed_body_idxs
for i in new_idxs:
fp.write('%d\n' % i)
self.processed_body_idxs = all_idxs
def save_matlab_map(self, name, np_arr):
import scipy.io
scipy.io.savemat(name + '.mat', dict(x=np_arr))
def export_vtk(self, file_name='data', local_cell_data={}, global_cell_data={}, vars_data_dtype=np.float32,
export_grid_data=True):
# get current engine time
t = self.physics.engine.t
nb = self.reservoir.mesh.n_res_blocks
nv = self.physics.n_vars
X = np.array(self.physics.engine.X, copy=False)
for v in range(nv):
local_cell_data[self.physics.vars[v]] = X[v:nb * nv:nv].astype(vars_data_dtype)
self.reservoir.export_vtk(file_name, t, local_cell_data, global_cell_data, export_grid_data)
# destructor to force to destroy all created C objects and free memory
def __del__(self):
for name in list(vars(self).keys()):
delattr(self, name)
def run_timestep_python(self, dt, t):
max_newt = self.params.max_i_newton
max_residual = np.zeros(max_newt + 1)
self.e.n_linear_last_dt = 0
well_tolerance_coefficient = 1e2
self.timer.node['simulation'].start()
for i in range(max_newt+1):
self.e.run_single_newton_iteration(dt)
self.e.newton_residual_last_dt = self.e.calc_newton_residual()
max_residual[i] = self.e.newton_residual_last_dt
counter = 0
for j in range(i):
if abs(max_residual[i] - max_residual[j])/max_residual[i] < 1e-3:
counter += 1
if counter > 2:
print("Stationary point detected!")
break
self.e.well_residual_last_dt = self.e.calc_well_residual()
self.e.n_newton_last_dt = i
# check tolerance if it converges
if ((self.e.newton_residual_last_dt < self.params.tolerance_newton and self.e.well_residual_last_dt < well_tolerance_coefficient * self.params.tolerance_newton )
or self.e.n_newton_last_dt == self.params.max_i_newton):
if (i > 0): # min_i_newton
break
r_code = self.e.solve_linear_equation()
self.timer.node["newton update"].start()
self.e.apply_newton_update(dt)
self.timer.node["newton update"].stop()
# End of newton loop
converged = self.e.post_newtonloop(dt, t)
self.timer.node['simulation'].stop()
return converged

Improving speed when dealing with big numbers and big shape of arrays in Python

I have a task:
How many pairs of (i,j): array_1[ i ] + array_1[ j ] > array_2[ i ] + array_2[ j ]
This is my code:
import numpy as np
import pandas as pd
n = 200000
series_1 = np.random.randint(low = 1,high = 1000,size = n)
series_1_T = series_1.reshape(n,1)
series_2 = np.random.randint(low = 1,high = 1000,size = n)
series_2_T = series_2.reshape(n,1)
def differ(x):
count = 0
tabel_1 = series_1 + series_1_T[x:x+2000]
tabel_2 = series_2 + series_2_T[x:x+2000]
diff= tabel_1[tabel_1>tabel_2].shape[0]
count += diff
return count
arr = pd.DataFrame(data = np.arange(0,n,2000),columns = ["numbers"])
count_each_run = arr["numbers"].apply(differ) #this one take about 8min 40s
print(count_each_run.sum())
Are there any ways to speedup this?

If you don't run in memory error you can do:
n = 200_000
s1 = np.random.randint(low=1, high=1000, size=(n,1))
s2 = np.random.randint(low=1, high=1000, size=(n,1))
t1 = s1 + s1.T
t2 = s2 + s2.T
tot = np.sum(t1>t2)
Otherwise you can create batches, and again depending on what you can fit in memory you can use one or two for loops:
n = 200_000
s1 = np.random.randint(low=1, high=1000, size=(n,1))
s2 = np.random.randint(low=1, high=1000, size=(n,1))
bs = 10_000 # batchsize
tot = 0
for i in range(0, n, bs):
for j in range(0, n, bs):
t1 = s1[i:i+bs] + s1[j:j+bs].T
t2 = s2[i:i+bs] + s2[j:j+bs].T
tot += np.sum(t1>t2)
If you need speed you can try something like numba or cython.

Label x-Axis in steps (ticks) with Date from big Database

I have a SQL Database with Dates in one row and values in the other
0 2019-09-30 12:03:35 363
1 2019-09-30 12:03:35 362
2 2019-09-30 12:03:35 363
3 2019-09-30 12:03:35 363
4 2019-09-30 12:03:35 363
I want to create a graph with >1000 values. My Code works as i want it, but the labeling on the x-axis is overloaded. I searched for a solution and it should work somehow with "ticks", but I cant make it fit my specific Datetime from my database.
The x-axis Labels are crowded and not readable.
I want to Label only in specific periods.
I tried several tutorials and documentation about the "ticks" but they dont handle given Datestrings like i have in my Database already.
import sqlite3 as lite
import pandas as pd
import matplotlib as matplt
import matplotlib.pyplot as plt
from fpdf import FPDF
import numpy as np
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)
datei = lite.connect("C://Users/Loos/Desktop/datenbankVersuch01.db")
conn = datei.cursor()
befehl = conn.execute('''SELECT * FROM sensorNeu''')
ausgeben = befehl.fetchall()
dataframeeins = pd.DataFrame(ausgeben)
print(dataframeeins.head(5))
print(dataframeeins.tail(5))
# info = dataframeeins[1].describe()
# print(dataframeeins.describe())
anzahlWerte = 400
periode = 400000
i = 0 # Zähler für die Werte
k = 0 # Nummerierung der Zwischenwerte
dates = []
values = []
zwischenspeicherSekunden = 0
zwischenspeicherWert = 0
for x in range(0, anzahlWerte):
# values.append(dataframeeins[1][(i * 4) + periode])
# dates.append(dataframeeins[0][(i * 4) + periode])
aktuelleZeit = dataframeeins[0][i + periode]
letzteZeichen = aktuelleZeit[17:19]
umwandeln = int(letzteZeichen)
zweiteZeit = dataframeeins[0][i + periode + 1]
letzteZeichenNeu = zweiteZeit[17:19]
umwandelnNeu = int(letzteZeichenNeu)
if k == 0:
#zwischenspeicherSekunden = umwandeln
zwischenspeicherSekunden = dataframeeins[0][i + periode]
# print(zwischenspeicherSekunden)
zwischenspeicherWert = dataframeeins[1][i+periode]
# print(zwischenspeicherWert)
k += 1
elif k != 0 and umwandeln == umwandelnNeu:
zwischenspeicherWert = (zwischenspeicherWert+dataframeeins[1][i+periode])/2
k += 1
elif k != 0 and umwandeln != umwandelnNeu:
# Wie groß ist der Abstand zwischen "umwandeln" und "umwandelnNeu"? eine oder mehrere sekunden
values.append(zwischenspeicherWert)
dates.append(zwischenspeicherSekunden)
k = 0
else:
print("FEHLER 0001: Keine Bedinung erfüllt!!!")
i += 1
aktuelleZeiten = dataframeeins[0][500000]
letzteZeichenn = aktuelleZeiten[17:19]
umwandeln = int(letzteZeichenn)
if umwandeln == 41:
print("Das letzte Zeichen ist 41")
#print(letzteZeichenn)
#print(dates[0])
#print(dates[1])
#print(dates[2])
#print(dates[3])
f, ax = plt.subplots()
# ax.set_xticks(np.arange(0))
# ax.xaxis.set_major_formatter(FormatStrFormatter('%s'))
plt.title("Der erste Versuch")
plt.suptitle("Prototyp01")
plt.plot(dates, values, marker='o', linestyle='--') # add linestyle=' ' or marker='o' or color='red'
plt.xlabel("Verlauf in Sekunden")
plt.ylabel("Sensorwert in mV/g")
plt.grid(True)
plt.show()
datei.commit()
datei.close()
How can I Label the x-Axis like just Label every 2,5 or 10 seconds and not every second?

Error: TypeError: cannot perform reduce with flexible type

i am using python version 3.7.Below is the code in which I am performing operation along the rows. i want the mean of the data which are along the rows but I get an error. i am new to numpy and python. i am reading the data from text file.
My code is:
import numpy as np
def getIndexFromDatetime(date_from, date_to):
'''date_from = [2, 10] : 10oclock of day2
'''
if date_from[1] > 24 or date_to[1] > 24: print('error')
start = (date_from[0] - 1) * 48 + date_from[1] * 2
end = (date_to[0] - 1) * 48 + date_to[1] * 2
return [start, end]
def is_num(s):
return s.replace(',', '').replace('.', '').replace('-', '').isnumeric()
def get_dataset(fpath):
with open(fpath, 'r') as f:
cnt = 0
DataWeather = {}
header = []
dtime = []
temp1 = []
temp2 = []
for line in f:
terms = line.split('\t')
#print(terms)
if cnt == 0: header1 = terms
if cnt == 1: header2 = terms
#header.append(terms[3])
cnt += 1
if cnt == 2:
for i in range(len(header1)):
header.append(header1[i]+header2[i])
#print(header)
for i in range(len(terms)):
DataWeather[header[i]] = []
#break
if cnt > 2:
for i in range(len(terms)):
if is_num(terms[i]):
DataWeather[header[i]].append(float(terms[i]))
else:
DataWeather[header[i]].append(terms[i])
for i in range(len(DataWeather[header[0]])):
dtime.append(DataWeather[header[0]][i]+' '+DataWeather[header[1]][i])
return DataWeather, header
def get_data(dataset, header, idx):
y = dataset[header][idx[0]:idx[1]]
return y
data_dir = 'weather_data'
month = 3
day = list(range(1,10))
header_idx = [2,3,4,5,7,16]
for d in day:
print(d)
dtime_from = [d, 9]
dtime_to = [d, 18]
dtime_idx = getIndexFromDatetime(dtime_from, dtime_to)
fpath = '{0}/2019-{1:02}.txt'.format(data_dir, month)
dataset, header = get_dataset(fpath)
for h in header_idx:
print(fpath)
print(header[h], dtime_from, dtime_to, dtime_idx)
data = get_data(dataset, header[h], dtime_idx)
#data= list(map(float,np.array(data)))
#data = map(np.array(data, dtype=np.float))
print(data)
print(np.mean(data))
i am getting the following error:
ret = umr_sum(arr, axis, dtype, out, keepdims)
TypeError: cannot perform reduce with flexible type
i also tried some functions like "map" and "list" as commented in the code still it gives error of converting string to float.

Looping over pandas DataFrame

I have a weird issue that the result doesn't change for each iteration. The code is the following:
import pandas as pd
import numpy as np
X = np.arange(10,100)
Y = X[::-1]
Z = np.array([X,Y]).T
df = pd.DataFrame(Z ,columns = ['col1','col2'])
dif = df['col1'] - df['col2']
for gap in range(100):
Up = dif > gap
Down = dif < -gap
df.loc[Up,'predict'] = 'Up'
df.loc[Down,'predict'] = 'Down'
df_result = df.dropna()
Total = df.shape[0]
count = df_result.shape[0]
ratio = count/Total
print(f'Total: {Total}; count: {count}; ratio: {ratio}')
The result is always
Total: 90; count: 90; ratio: 1.0
when it shouldn't be.

Found the root of the problem 5 mins after posting this question. I just needed to reset the dataFrame to the original to fix the problem.
import pandas as pd
import numpy as np
X = np.arange(10,100)
Y = X[::-1]
Z = np.array([X,Y]).T
df = pd.DataFrame(Z ,columns = ['col1','col2'])
df2 = df.copy()#added this line to preserve the original df
dif = df['col1'] - df['col2']
for gap in range(100):
df = df2.copy()#reset the altered df back to the original
Up = dif > gap
Down = dif < -gap
df.loc[Up,'predict'] = 'Up'
df.loc[Down,'predict'] = 'Down'
df_result = df.dropna()
Total = df.shape[0]
count = df_result.shape[0]
ratio = count/Total
print(f'Total: {Total}; count: {count}; ratio: {ratio}')

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Parallel processing in Python with input output file - python-3.x

Related

Create a Multiprocess script from a system consisting of multiple files

Improving speed when dealing with big numbers and big shape of arrays in Python

Label x-Axis in steps (ticks) with Date from big Database

Error: TypeError: cannot perform reduce with flexible type

Looping over pandas DataFrame

Categories

Resources