Have a pytorch class produce 2 separate datasets - pytorch

Does anyone have any suggestions for creating 2 datasets at once? When I run dataset = ImbCircuitDataset('test'), I want it to save 2 separate sets, even if I have to go all the way back to self.data and split it there; I just want the class to produce 2 separate datasets.
Basically, I want to run set1, set2 = ImbCircuitDataset('test').
I started doing it at the bottom of def process(self), but I have a feeling it's wrong. I am thinking that in def processed_file_names(self) I need to return 'set1.pt', 'set2.pt'.
from scipy.io import loadmat
import pandas as pd
from torch_geometric.data import Data, InMemoryDataset
import torch
import neptune.new as neptune
import numpy as np
import torch_geometric
from tqdm import tqdm
from typing import Tuple, Union
from torch import Tensor
from collections.abc import Sequence
from torch_geometric.utils import from_networkx
import networkx as nx
import matplotlib.pyplot as plt
IndexType = Union[slice, Tensor, np.ndarray, Sequence]
print(f"Torch version: {torch.__version__}")
print(f"Cuda available: {torch.cuda.is_available()}")
print(f"Torch geometric version: {torch_geometric.__version__}")
class ImbCircuitDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None,
                 pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return 'shuffled_data.mat'

    @property
    def processed_file_names(self):
        return 'data.pt'

    def download(self):
        pass

    def process(self):
        self.raw_data = loadmat(self.raw_paths[0], squeeze_me=True)
        self.data = pd.DataFrame(self.raw_data['Graphs'])
        self.data = self.data.sort_values('Labels', ascending=True, ignore_index=True)
        for i in range(len(self.data)):
            self.data['Ln'][i] = dict(enumerate(self.data['Ln'][i]))
        data_list = []
        for index, cir in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            nxg = nx.Graph(self.data['A'][index])
            nx.set_node_attributes(nxg, self.data['Ln'][index], 'component')
            pt_graph = self._get_graph_object(nxg)
            pt_graph.x = self._get_node_features(nxg, self.data['Ln'][index])
            pt_graph.performance = torch.tensor(self.data['Labels'][index], dtype=torch.float)
            data_list.append(pt_graph)
        if self.pre_filter is not None:
            data_list = [d for d in data_list if self.pre_filter(d)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(d) for d in data_list]
        split = 0.5
        N = len(data_list)
        set_1 = data_list[:int(N * split)]   # slice indices must be ints
        set_2 = data_list[int(N * split):]
        data1, slices1 = self.collate(set_1)
        data2, slices2 = self.collate(set_2)
        torch.save((data1, slices1), self.processed_paths[0])
        # NOTE: this second save overwrites the first one, because both use
        # processed_paths[0] and processed_file_names only returns 'data.pt'
        torch.save((data2, slices2), self.processed_paths[0])

    def _get_node_features(self, nxgraph, node_labels):
        betweenness = list(nx.betweenness_centrality(nxgraph).values())
        eigenvector = list(nx.eigenvector_centrality(
            nxgraph, max_iter=600).values())
        mapping_dict = {'C': 0, 'G': 1, 'I': 2, 'O': 3, 'P': 4, 'R': 5}
        component_labels = []
        for value in node_labels.values():
            if value in mapping_dict:
                component_labels.append(mapping_dict[value])
        all_feats = [component_labels, betweenness, eigenvector]
        all_feats = np.asarray(all_feats).transpose()
        return torch.tensor(all_feats, dtype=torch.float)

    def _get_graph_object(self, nx_graph):
        nxg = from_networkx(nx_graph)
        return nxg

    @property
    def num_node_features(self) -> int:
        return 3

    @property
    def num_classes(self) -> int:
        return 2
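For what it's worth, a minimal sketch of the idea hinted at in the question: have processed_file_names return two files, save each split to its own processed path, and let a constructor argument pick which split to load. The split argument and the _build_data_list helper are hypothetical additions, not part of the original class, and the details may need adjusting for your torch_geometric version.

# Hedged sketch: two processed files, one dataset object per split.
import torch
from torch_geometric.data import InMemoryDataset

class ImbCircuitDatasetSplit(InMemoryDataset):
    def __init__(self, root, split='set1', transform=None, pre_transform=None,
                 pre_filter=None):
        self.split = split                                   # hypothetical parameter
        super().__init__(root, transform, pre_transform, pre_filter)
        path = self.processed_paths[0] if split == 'set1' else self.processed_paths[1]
        self.data, self.slices = torch.load(path)

    @property
    def processed_file_names(self):
        return ['set1.pt', 'set2.pt']                        # two processed paths

    def process(self):
        # _build_data_list is a hypothetical helper holding the same
        # graph-building loop, pre_filter and pre_transform as in process() above
        data_list = self._build_data_list()
        half = len(data_list) // 2
        torch.save(self.collate(data_list[:half]), self.processed_paths[0])
        torch.save(self.collate(data_list[half:]), self.processed_paths[1])

# usage: two dataset objects backed by the two saved splits
# set1 = ImbCircuitDatasetSplit('test', split='set1')
# set2 = ImbCircuitDatasetSplit('test', split='set2')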

Related

Is there a way in an external .py to catch data populated by a kivy button (row by row) and then get len() of that dataset as well?

How do I catch, in file 2, the tuples generated by clicks on the kivy button in file 1, along with the corresponding number of rows, i.e. their len()? Any support is much appreciated.
I created a kivy app delivering a row of tupled values every time I click my button. That works fine. Now I want to pick up, e.g., a dataset of five clicks on the button, i.e. 5 rows of tuples. Below is what I did in file 1:
file 1.py:
kv = '''
<Launch>:
    BoxLayout:
        Button:
            size:(80,80)
            size_hint:(None,None)
            text:"..."
            on_press: root.build()
'''
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
import pandas as pd
import numpy as np
from kivy.app import App
from kivy.uix.button import Button

def test(t):
    size = t
    a = pd.DataFrame(columns=['col1', 'col2', 'col3'])
    a['col1'] = pd.DataFrame(np.random.randint(1, 50, size))
    a['col2'] = pd.DataFrame(np.random.randint(1, 50, size))
    a['col3'] = pd.DataFrame(np.random.rand(size))
    t = a
    return t

def vars_n(self):
    a = test(t=1)
    # Define objects for dataframe and col inputs
    self.a_num = pd.DataFrame(test(1))
    self.a_limit = a[(a.col3) < 1 & (a.col3 > 0.8)]
    self.a_col1 = a['col1']
    self.a_col2 = a['col2']
    self.a_col3 = a['col3']
    cols = self.a_col1, self.a_col2, self.a_col3
    lst = []
    self.a_col1, self.a_col2, self.a_col3 = 'src', 'dest', 'col3'
    for a in range(1):
        lst.append([self.a_col1, self.a_col2, self.a_col3])
        self.a_col1, self.a_col2, self.a_col3 = \
            np.random.randint(1, 40, size=1), np.random.randint(1, 40, size=1), np.random.rand(1)
    df = pd.DataFrame(lst, columns=cols)
    tuple1 = self.a_col1
    tuple2 = self.a_col2
    tuple3 = self.a_col3
    q = tuple(zip(tuple1, tuple2, tuple3))
    return q

class MyDf(App, object):
    def __init__(self):
        super().__init__()

    def test_def(self):
        msg = test(1)
        print(msg)

    def test_vars_n(self):
        msg = vars_n(test(t=1))
        print(msg)

    def length(self):
        result = len(vars_n(test(t=1)))
        print(result)

    # Define output for activation of kivy button
    def press(self, instance):
        print(vars_n(test(t=1)))

    # Define kivy button configuration
    def build(self):
        butt = Button(text="...")
        butt.bind(on_press=self.press)
        return butt

MyDf().run()
After e.g. five clicks, the result could be the dataset below:
((6, 22, 0.8525529856428397),)
((12, 7, 0.3912468711230911),)
((30, 14, 0.979806646854341),)
((21, 27, 0.618131650972481),)
((8, 20, 0.9164440407619223),)
So, in file 2, I'd like to pull the five lines of the dataset above and, at the same time, get the len of that dataset, i.e. 5. I tried this, but it does not seem to catch the output of file 1:
file 2.py:
import pandas as pd
import numpy as np
my_instance = MyDf()
interactions = my_instance.test_vars_n()
interactions = np.array(interactions)
print(len(interactions)) # testing result
Got this error:
Traceback (most recent call last):
  File "G:\...\...\...\....\file2.py", line 38, in <module>
    print(len(interactions))
TypeError: len() of unsized object
EDIT: this is the example that inspired the attempts above:
File 1
import pandas as pd
import numpy as np

def test(t):
    size = t
    a = pd.DataFrame(columns=['col1', 'col2', 'col3'])
    a['col1'] = pd.DataFrame(np.random.randint(1, 50, size))
    a['col2'] = pd.DataFrame(np.random.randint(1, 50, size))
    a['col3'] = pd.DataFrame(np.random.rand(size))
    t = a
    return t

class ClassTest(object):
    def test_def(self):
        msg = test(1)
        print(msg)
File 2:
from Call_an_outside_function_from_class_file_1 import ClassTest
my_new_instance = ClassTest()
ClassTest().test_def()
I got this result, without needing the App.get_running_app() replacement:
   col1  col2      col3
0    48     3  0.514489
Process finished with exit code 0
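For reference, a minimal sketch of one way to make the clicked rows reachable from other code: accumulate them on the App instance itself. Note this assumes everything runs in the same Python process; a second script that merely imports MyDf and creates a fresh instance never sees the clicks from the first run, because each run is a separate process.

# Hedged sketch: keep one tuple per click on the running App instance,
# then read the rows and their len() after run() returns.
import numpy as np
from kivy.app import App
from kivy.uix.button import Button

class MyDf(App):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.rows = []                      # one tuple appended per button press

    def press(self, instance):
        row = (np.random.randint(1, 40), np.random.randint(1, 40), np.random.rand())
        self.rows.append(row)
        print(row, "-> rows so far:", len(self.rows))

    def build(self):
        butt = Button(text="...")
        butt.bind(on_press=self.press)
        return butt

if __name__ == '__main__':
    app = MyDf()
    app.run()               # blocks until the window is closed
    print(app.rows)         # the full dataset of clicks
    print(len(app.rows))    # e.g. 5 after five clicks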

Can't get pyqtgraph chart to update

I'm trying to import tick data from MT5 and display it on a candlestick chart in pyqtgraph, but the graph only displays the first two candles, which are preloaded to meet the minimum data requirement and prevent an exception.
Beyond those original two candles, the chart does not update with new values.
import pyqtgraph as pg
import numpy as np
from PyQt5 import QtWidgets, QtCore, QtGui
from pyqtgraph import PlotWidget, plot, QtCore, QtGui
import sys
import os
from random import randint
import time
import threading
import os
import queue
import random
import copy
import MetaTrader5 as mt5
from datetime import datetime
#------------------------------------------------------------------------------
'''chart items'''
class CandlestickItem(pg.GraphicsObject):
    _boundingRect = QtCore.QRectF()
    # ...
    def __init__(self):
        pg.GraphicsObject.__init__(self)
        self.flagHasData = False

    def set_data(self, data):
        self.data = data
        self.flagHasData = True
        self.generatePicture()
        self.informViewBoundsChanged()

    def generatePicture(self):
        self.picture = QtGui.QPicture()
        path = QtGui.QPainterPath()
        p = QtGui.QPainter(self.picture)
        p.setPen(pg.mkPen('w'))
        w = (self.data[1][0] - self.data[0][0]) / 3.
        for (t, open, close) in self.data:
            # line = QtCore.QLineF(t, min, t, max)
            # path.moveTo(line.p1())
            # path.lineTo(line.p2())
            # p.drawLine(line)
            rect = QtCore.QRectF(t - w, open, w * 2, close - open)
            path.addRect(rect)
            if open > close:
                p.setBrush(pg.mkBrush('r'))
            else:
                p.setBrush(pg.mkBrush('g'))
            p.drawRect(rect)
        p.end()
        self._boundingRect = path.boundingRect()

    def paint(self, p, *args):
        if self.flagHasData:
            p.drawPicture(0, 0, self.picture)

    def boundingRect(self):
        return self._boundingRect
#------------------------------------------------------------------------------
# establish connection to the MetaTrader 5 terminal
if not mt5.initialize():
    print("initialize() failed, error code =", mt5.last_error())
    quit()
# attempt to enable the display of the GBPUSD in MarketWatch
selected = mt5.symbol_select("EURUSD", True)
if not selected:
    print("Failed to select EURUSD")
    mt5.shutdown()
    quit()
#------------------------------------------------------------------------------
class tick:
    last_tick = 0
    current_tick = 0
    current_tick_number = 1

    def __init__(self):
        self.tick_array = np.zeros((100000, 3), dtype=float)
        self.pass_data = False

    def _time(self, tick_index_number):
        return self.tick_array[self.tick_index(tick_index_number), 0]

    def _open(self, tick_index_number):
        # print(tick_index_number)
        return self.tick_array[self.tick_index(tick_index_number), 1]

    def _close(self, tick_index_number):
        return self.tick_array[self.tick_index(tick_index_number), 2]

    def _min(self, tick_index_number):
        return self.tick_array[self.tick_index(tick_index_number), 3]

    def _max(self, tick_index_number):
        return self.tick_array[self.tick_index(tick_index_number), 4]

    # return a negative index of n
    def tick_index(self, n=0):
        return self.current_tick_number - n

    # gets EURUSD current tick values
    def get_tick(self):
        return mt5.symbol_info_tick("EURUSD")
        # add self.time/bid/ask to get_tick instead
#------------------------------------------------------------------------------
    # updates tick array
    def update_tick(self):
        while True:
            # get current tick value
            current_tick = self.get_tick()
            # if current tick is unique, add that value to last_tick and continue
            if self.last_tick != current_tick:
                self.last_tick = current_tick
                # update the array with the new tick values
                self.tick_array[self.current_tick_number, 0], self.tick_array[self.current_tick_number, 1], self.tick_array[self.current_tick_number, 2] = datetime.fromtimestamp(current_tick[5] / 1000.0).strftime("%m%d%Y%I%M%S"), self.tick_array[self.current_tick_number - 1, 2], current_tick[1]
                self.current_tick_number += 1
                q.put(self.tick_array[:self.current_tick_number])

    def tick_datafeed(self):
        return self.tick_array[:self.current_tick_number]

tick = tick()
#------------------------------------------------------------------------------
''' launch threads that work the chart'''
class Threads:
    def __init__(self):
        pass

    def thread_launch_update_tick(self):
        t1 = threading.Thread(target=tick.update_tick, args=())
        t1.start()

    def thread_launch_get_data_from_update(self):
        t2 = threading.Thread(target=get_data_from_update, args=())
        t2.start()

def get_data_from_update():
    while True:
        i = q.get()
        item.set_data(i)
        print(tick.tick_array[:tick.current_tick_number])
        q.task_done()
#------------------------------------------------------------------------------
app = QtWidgets.QApplication([])
item = CandlestickItem()
item.set_data(tick.tick_array[:tick.current_tick_number + 1])
plt = pg.plot()
plt.addItem(item)
plt.setWindowTitle('pyqtgraph example: customGraphicsItem')
q = queue.Queue()
threads = Threads()
threads.thread_launch_get_data_from_update()
threads.thread_launch_update_tick()
#------------------------------------------------------------------------------
if __name__ == '__main__':
    # window = Window()
    import sys
    if (sys.flags.interactive != 1) or not hasattr(QtCore, 'PYQT_VERSION'):
        QtWidgets.QApplication.instance().exec_()
The resulting array data looks like this; each row is [date in unix time, last tick, current tick], i.e. time/open/close:
[[0.00000000e+00 0.00000000e+00 0.00000000e+00]
[7.26202211e+12 0.00000000e+00 1.01348000e+00]
[7.26202211e+12 1.01348000e+00 1.01349000e+00]
[7.26202211e+12 1.01349000e+00 1.01348000e+00]
[7.26202211e+12 1.01348000e+00 1.01347000e+00]
[7.26202211e+12 1.01347000e+00 1.01348000e+00]
[7.26202211e+12 1.01348000e+00 1.01347000e+00]
[7.26202211e+12 1.01347000e+00 1.01348000e+00]
[7.26202211e+12 1.01348000e+00 1.01347000e+00]
[7.26202211e+12 1.01347000e+00 1.01346000e+00]
[7.26202211e+12 1.01346000e+00 1.01347000e+00]
[7.26202211e+12 1.01347000e+00 1.01346000e+00]]
Can anyone point out my flaw?
The get_data_from_update() function should update the chart, but it simply doesn't work, for reasons that are beyond me.
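One thing worth checking, offered as a hedged suggestion rather than a confirmed fix: item.set_data() builds QPicture/QPainter objects, and Qt generally expects that kind of GUI work to happen in the main (GUI) thread, while here it runs in the t2 worker thread. A minimal sketch of moving that work onto the GUI thread with a QTimer, reusing the q, item and queue objects already defined in the script above:

# Hedged sketch: drain the queue from the GUI thread with a QTimer instead of
# calling item.set_data() from a worker thread.
def poll_queue():
    try:
        while True:                      # drain everything queued so far
            data = q.get_nowait()
            if len(data) >= 2:           # generatePicture() needs at least two rows
                item.set_data(data)
            q.task_done()
    except queue.Empty:
        pass

timer = QtCore.QTimer()
timer.timeout.connect(poll_queue)
timer.start(50)                          # poll for new ticks every 50 ms
# ...and then only launch the MT5 reader thread: threads.thread_launch_update_tick()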

How to set Timestamp (pandas Datetime) to xlim of plot with FuncAnimation?

I want to limit my graph from the left and set (the current time - 2 hours) as the xlim. I tried adding this to the "update" function:
self.ax.set_xlim(left=max(self.data.iloc[0, 0], self.data.iloc[-1, 0] - pd.Timedelta(hours=2)))
But it doesn't work. Could anyone help me do this, please?
from matplotlib import pyplot as plt
from matplotlib.animation import FuncAnimation
from datetime import datetime
import pandas as pd

def to_pd(wt, wf):
    p = pd.DataFrame({"Date": [wt], "Cost": [wf]})
    p["Date"] = pd.to_datetime(p["Date"], format='%Y-%m-%d %H:%M:%S')
    return p

fig = plt.figure(figsize=(18, 8), dpi=90)

class Graph():
    def __init__(self):
        self.ax = fig.add_subplot()
        self.start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.data = to_pd(self.start_time, 0)
        self.line, = self.ax.plot(self.data.Date, self.data.Cost)

    def update(self, i):
        self.current_time = (self.data.iloc[-1, 0] + pd.Timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S')
        self.data = self.data.append(to_pd(self.current_time, (-1)**i))
        self.line.set_data(self.data.Date, self.data.Cost)
        self.ax.relim()
        self.ax.autoscale_view()
        return self.line,

object = Graph()

def animate(i):
    return object.update(i)

anim = FuncAnimation(fig, animate, frames=200, interval=100, blit=True)
plt.show()
Solved, though I think there is a more convenient way. Here is what I did: I just added the following to the 'update' function, which clears out all the old data:
if self.data.iloc[-1, 0] - pd.Timedelta(hours=2) >= self.data.iloc[0, 0]:
    self.data = self.data[self.data.Date > self.data.iloc[0, 0]]
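For what it's worth, a likely reason the original set_xlim attempt did nothing is blit=True: with blitting, only the artists returned by update are redrawn, so axis-limit changes are not repainted. A minimal self-contained sketch with blit=False, where the sliding 2-hour window is applied directly in the update function:

# Hedged sketch (assumes blit=False so the rescaled axes are actually redrawn):
# keep a sliding 2-hour window by clamping xlim every frame instead of
# dropping old rows.
from matplotlib import pyplot as plt
from matplotlib.animation import FuncAnimation
import pandas as pd

fig, ax = plt.subplots()
dates = [pd.Timestamp.now()]
costs = [0]
line, = ax.plot(dates, costs)

def update(i):
    dates.append(dates[-1] + pd.Timedelta(minutes=1))
    costs.append((-1) ** i)
    line.set_data(dates, costs)
    ax.relim()
    ax.autoscale_view()
    # clamp the left edge to (latest time - 2 hours)
    ax.set_xlim(left=max(dates[0], dates[-1] - pd.Timedelta(hours=2)))
    return line,

anim = FuncAnimation(fig, update, frames=200, interval=100, blit=False)
plt.show()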

How do you parse the bin file from INT8 Calibration of TensorRT?

I have created a Python script for calibrating (INT8) the dynamic scales of the activations of TinyYOLO V2 using TensorRT. The script gave me a file called calibration_cache.bin. How do I parse the .bin file? What do the values inside it mean?
calibrator.py
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from PIL import Image
import ctypes
import tensorrt as trt
import os

CHANNEL = 3
HEIGHT = 416
WIDTH = 416

class PythonEntropyCalibrator(trt.IInt8EntropyCalibrator):
    def __init__(self, input_layers, stream):
        trt.IInt8EntropyCalibrator.__init__(self)
        self.input_layers = input_layers
        self.stream = stream
        self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes)
        stream.reset()

    def get_batch_size(self):
        return self.stream.batch_size

    def get_batch(self, bindings, names):
        batch = self.stream.next_batch()
        if not batch.size:
            return None
        cuda.memcpy_htod(self.d_input, batch)
        for i in self.input_layers[0]:
            assert names[0] != i
        bindings[0] = int(self.d_input)
        return bindings

    def read_calibration_cache(self, length=0):
        if os.path.exists('calibration_cache.bin'):
            with open('calibration_cache.bin', 'rb') as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache, size=0):
        with open('calibration_cache.bin', 'wb') as f:
            f.write(cache)
        return None

class ImageBatchStream():
    def __init__(self, batch_size, calibration_files, preprocessor):
        self.batch_size = batch_size
        self.max_batches = (len(calibration_files) // batch_size) + \
            (1 if (len(calibration_files) % batch_size) else 0)
        self.files = calibration_files
        self.calibration_data = np.zeros((batch_size, CHANNEL, HEIGHT, WIDTH),
                                         dtype=np.float32)
        self.batch = 0
        self.preprocessor = preprocessor

    @staticmethod
    def read_image_chw(path):
        img = Image.open(path).resize((WIDTH, HEIGHT), Image.NEAREST)
        im = np.array(img, dtype=np.float32, order='C')
        im = im[:, :, ::-1]
        im = im.transpose((2, 0, 1))
        return im

    def reset(self):
        self.batch = 0

    def next_batch(self):
        if self.batch < self.max_batches:
            imgs = []
            files_for_batch = self.files[self.batch_size * self.batch:
                                         self.batch_size * (self.batch + 1)]
            for f in files_for_batch:
                print("[ImageBatchStream] Processing ", f)
                img = ImageBatchStream.read_image_chw(f)
                img = self.preprocessor(img)
                imgs.append(img)
            for i in range(len(imgs)):
                self.calibration_data[i] = imgs[i]
            self.batch += 1
            return np.ascontiguousarray(self.calibration_data, dtype=np.float32)
        else:
            return np.array([])
test.py
from random import shuffle
from PIL import Image
import glob
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import os
from calibrator import *

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
model_file = './tiny_yolov2/Model.onnx'
dataset_loc = './Dataset/*.jpg'

def normalize(data):
    data /= 255.0
    return data

def create_calibration_dataset():
    calibration_files = glob.glob(dataset_loc)
    shuffle(calibration_files)
    return calibration_files[:20]

calibration_files = create_calibration_dataset()
NUM_IMAGES_PER_BATCH = 5
batchstream = ImageBatchStream(NUM_IMAGES_PER_BATCH, calibration_files, normalize)
Int8_calibrator = PythonEntropyCalibrator(["conv2d_91_input"], batchstream)
builder = trt.Builder(TRT_LOGGER)
builder.int8_calibrator = Int8_calibrator
builder.refittable = True
builder.int8_mode = True
network = builder.create_network()
parser = trt.OnnxParser(network, TRT_LOGGER)
print(builder.int8_mode, builder.platform_has_fast_int8, builder.refittable)
with open(model_file, 'rb') as model:
    parser.parse(model.read())
    print('Done reading ONNX File\n')
engine = builder.build_cuda_engine(network)
print(engine, TRT_LOGGER)
with open("model.trt", "wb") as f:
    f.write(engine.serialize())
    print("Done converting the ONNX to TRT\n")
tinyolo_fitter = trt.Refitter(engine, TRT_LOGGER)
print(tinyolo_fitter.refit_cuda_engine())
print(tinyolo_fitter.get_tensors_with_dynamic_range())
calibration_cache.bin
TRT-5105-EntropyCalibration
image: 3c010a14
scalerPreprocessor_scaled: 38018ba0
image2: 38018ba0
batchnormalization_1_output: 3d07b31d
leakyrelu_1_output: 3c98a317
maxpooling2d_1_output: 3c1e5b30
batchnormalization_2_output: 3ca6aa67
leakyrelu_2_output: 3ca6aa67
maxpooling2d_2_output: 3c82cf7d
batchnormalization_3_output: 3ce07ce8
leakyrelu_3_output: 3ce52236
maxpooling2d_3_output: 3cc8ed6f
batchnormalization_4_output: 3d3df55f
leakyrelu_4_output: 3c651727
maxpooling2d_4_output: 3cec84fc
batchnormalization_5_output: 3d0f51e3
leakyrelu_5_output: 3cb52377
maxpooling2d_5_output: 3d026049
batchnormalization_6_output: 3d387291
leakyrelu_6_output: 3ccc009a
maxpooling2d_6_output: 3c8d0f0c
batchnormalization_7_output: 3e0de3d2
leakyrelu_7_output: 3d7b4397
batchnormalization_8_output: 3cc459d6
leakyrelu_8_output: 3cbd9562
grid: 3ddc32dc
def read_calibration_cache(self, length=0):
    if os.path.exists('calibration_cache.bin'):
        with open('calibration_cache.bin', 'rb') as f:
            return f.read()
    return None
This does the work, I guess. If there is a calibration_cache.bin file in your directory, the calibrator parses it instead of calibrating again.
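To actually read the values back, here is a hedged sketch based on the file shown above: each line after the TRT-...-EntropyCalibration header looks like "tensor name: hex", and treating the hex as a big-endian IEEE-754 float32 yields the per-tensor calibration scale. This interpretation is an assumption from inspecting the cache contents, not an official format description.

# Hedged sketch: decode each "name: hex" line of the cache as a float32 scale.
import struct

def parse_calibration_cache(path='calibration_cache.bin'):
    scales = {}
    with open(path, 'r') as f:
        next(f)                      # skip the "TRT-5105-EntropyCalibration" header line
        for line in f:
            if ':' not in line:
                continue
            name, hex_value = line.rsplit(':', 1)
            # reinterpret the 4 hex-encoded bytes as a big-endian float32
            scales[name.strip()] = struct.unpack('!f', bytes.fromhex(hex_value.strip()))[0]
    return scales

print(parse_calibration_cache())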

How can one parallelize geopandas "to_file" function

I am trying to implement a parallelized function for Geopandas that takes a single vector dataset (i.e. a Shapefile containing a Multipolygon data type) and converts it to a standard cellular grid with cell x and y sizes defined by the user.
As this function may cause serious memory issues (i.e. caused by too high a spatial resolution), I was wondering whether it would be possible to save the data iteratively to the given destination file. That way, as each parallel process runs the "GRID" function, the same process could save its data in append mode, and I believe memory issues would be avoided.
Here is my "SHP_to_GRID_Function". Note that the code below still requires that all the data generated by the multiprocessing be held in memory directly (so overflow is more than certain for large datasets).
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon
from multiprocessing import Pool
import os
from functools import partial

def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

def parallelize_df(gdf, func, n_cores, dx=100, dy=100, verbose=False):
    Geometries = gdf.loc[:, 'geometry'].values
    pool = Pool(processes=n_cores)
    func_partial = partial(func, dx, dy, verbose)  # only the polygon argument is left free
    results = pool.map(func_partial, Geometries)
    pool.close()
    pool.join()
    print(np.shape(results))
    GRID = gpd.GeoSeries(np.array(results).ravel())
    print("GRID well created")
    return GRID

def generate_grid_from_Poligon(dx=100, dy=100, verbose=False, polygon=None):
    if verbose == True:
        info('function parallelize_df')
    else:
        None
    xmin, ymin, xmax, ymax = polygon.bounds
    lenght = dx
    wide = dy
    cols = list(np.arange(int(np.floor(xmin)), int(np.ceil(xmax)), wide))
    rows = list(np.arange(int(np.floor(ymin)), int(np.ceil(ymax)), lenght))
    rows.reverse()
    subpolygons = []
    for x in cols:
        for y in rows:
            subpolygons.append(Polygon([(x, y), (x + wide, y), (x + wide, y - lenght), (x, y - lenght)]))
    return subpolygons

def main(GDF, n_cores='standard', dx=100, dy=100, verbose=False):
    """
    GDF: geodataframe
    n_cores: use 'standard' or a positive int value. It sets the number of cores to use in the multiprocessing.
    dx: dimension in the x coordinate to make the grid
    dy: dimension in the y coordinate to make the grid
    """
    if isinstance(n_cores, str):
        import multiprocessing
        N_cores = multiprocessing.cpu_count() - 1
    elif isinstance(n_cores, int):
        N_cores = n_cores
    GRID_GDF = parallelize_df(GDF, generate_grid_from_Poligon, n_cores=N_cores, dx=dx, dy=dy, verbose=verbose)
    return GRID_GDF
I thank you for your time,
Sincerely yours,
Philipe Leal
I have finally come up with a solution to my question. It is not perfect, since it requires several writing processes and one final concatenation over all the temporary files created during the run.
Feel free to suggest alternatives.
Here is the solution I found.
import numpy as np
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon
from multiprocessing import Pool, Lock, freeze_support
import os
from functools import partial
import time

def info(time_value):
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())
    print("Time spent: ", time.time() - time_value)

def init(l):
    global lock
    lock = l

def Data_Arranger(to_filename):
    """This function concatenates and deletes temporary files. It is an arranger
    of the multiprocessing data results."""
    Base = os.path.join(os.path.dirname(to_filename), 'temp')
    Strings = [file for file in os.listdir(Base)]
    Strings = [os.path.join(Base, S) for S in Strings]
    if not os.path.exists(os.path.dirname(to_filename)):
        os.mkdir(os.path.dirname(to_filename))
    Sq = [S for S in Strings if S.endswith('.shp')]
    gpd.GeoDataFrame(pd.concat([gpd.read_file(sq1) for sq1 in Sq]), crs=GDF.crs).to_file(to_filename)
    for sq1 in Sq:
        os.remove(sq1)
    import shutil
    shutil.rmtree(Base, ignore_errors=True)

def parallelize_df(gdf, func, n_cores, dx=100, dy=100, verbose=False, to_filename=None):
    Geometries = gdf.loc[:, 'geometry'].values
    crs = gdf.crs
    pool = Pool(processes=n_cores, initializer=init, initargs=(Lock(),))
    func_partial = partial(func, dx, dy, verbose, to_filename, crs)  # only the polygon argument is left free
    pool.map(func_partial, Geometries)
    pool.close()
    pool.join()

def generate_grid_from_gdf(dx=100, dy=100, verbose=False, to_filename=None, crs=None, polygon=None):
    if verbose == True:
        info(time.time())
    else:
        None
    xmin, ymin, xmax, ymax = polygon.bounds
    lenght = dx
    wide = dy
    cols = list(np.arange(int(np.floor(xmin)), int(np.ceil(xmax)), wide))
    rows = list(np.arange(int(np.floor(ymin)), int(np.ceil(ymax)), lenght))
    rows.reverse()
    subpolygons = []
    for x in cols:
        for y in rows:
            subpolygons.append(Polygon([(x, y), (x + wide, y), (x + wide, y - lenght), (x, y - lenght)]))
    lock.acquire()
    print('parent process: ', os.getppid(), ' has activated the Lock')
    GDF = gpd.GeoDataFrame(geometry=subpolygons, crs=crs)
    to_filename = os.path.join(os.path.dirname(to_filename), 'temp',
                               str(os.getpid()) + '_' + str(time.time()) + '.' + os.path.basename(to_filename).split('.')[-1])
    if not os.path.exists(os.path.dirname(to_filename)):
        os.mkdir(os.path.dirname(to_filename))
    try:
        print("to_filename: ", to_filename)
        GDF.to_file(to_filename)
    except:
        print("error in the file saving")
    lock.release()
    print('parent process: ', os.getppid(), ' has unlocked')

def main(GDF, n_cores='standard', dx=100, dy=100, verbose=False, to_filename=None):
    """
    GDF: geodataframe
    n_cores: use 'standard' or a positive int value. It sets the number of cores to use in the multiprocessing.
    dx: dimension in the x coordinate to make the grid
    dy: dimension in the y coordinate to make the grid
    verbose: whether or not to show info from the processing. Applicable only when not running
             on Windows, or when running in a separate console on Windows.
    to_filename: the path which will be used to save the resultant file.
    """
    if isinstance(n_cores, str):
        import multiprocessing
        N_cores = multiprocessing.cpu_count() - 1
    elif isinstance(n_cores, int):
        N_cores = n_cores
    parallelize_df(GDF, generate_grid_from_gdf, n_cores=N_cores, dx=dx, dy=dy, verbose=verbose, to_filename=to_filename)
    Data_Arranger(to_filename)

####################################################################################
if "__main__" == __name__:
    freeze_support()
    GDF = gpd.read_file("Someone's_file.shp")
    to_filename = "To_file_directory/To_file_name.shp"
    dx = 500  # resampling to 500 units. E.g., assuming the coordinate reference system is in meters, this returns grid polygons of 500 m in the longitudinal dimension.
    dy = 500  # same here: assuming the CRS is in meters, the resultant file will have polygons of 500 m in the latitudinal dimension.
    main(GDF, dx=dx, dy=dy, verbose=True, to_filename=to_filename)
I thank you for your time.
Philipe Leal
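As a hedged alternative to the temporary-file approach (assuming a geopandas/Fiona version whose to_file accepts mode='a' and a driver that supports appending, such as GPKG), each worker could append its chunk to a single GeoPackage while holding the lock, which avoids the final concatenation step:

# Hedged sketch: append each worker's grid chunk to one GeoPackage under the lock.
import os
import geopandas as gpd

def save_chunk(subpolygons, crs, out_path='grid.gpkg'):
    chunk = gpd.GeoDataFrame(geometry=subpolygons, crs=crs)
    lock.acquire()   # same multiprocessing Lock passed through the pool initializer above
    try:
        # append if the file already exists, otherwise create it
        mode = 'a' if os.path.exists(out_path) else 'w'
        chunk.to_file(out_path, driver='GPKG', mode=mode)
    finally:
        lock.release()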
