I am trying to use multiprocessing to pull data from Cassandra, but I'm facing an issue. I want to pull the data for a single key or multiple keys using the multiprocessing approach recommended for the Cassandra driver.
My cassandra_db class:
from cassandra.cluster import Cluster
import cassandra
import pandas as pd
import numpy as np
from datetime import datetime
import sys
import os
from threading import Event
import itertools
from multiprocessing import Pool
from cassandra.concurrent import execute_concurrent_with_args
from cassandra.query import tuple_factory

ip_address = '127.0.0.1'

def pandas_factory(colnames, rows):
    # Row factory that returns each page of results as a pandas DataFrame.
    return pd.DataFrame(rows, columns=colnames)

class cassandra_db(object):
    concurrency = 2  # chosen to match the default in execute_concurrent_with_args

    def __init__(self, process_count=None):
        self.pool = Pool(processes=process_count, initializer=self._setup)

    @classmethod
    def _setup(cls):
        cls.session = Cluster([ip_address]).connect(keyspace='test')
        cls.session.row_factory = pandas_factory
        cls.prepared = cls.session.prepare('SELECT * FROM tr_test WHERE key=?')

    def close_pool(self):
        self.pool.close()
        self.pool.join()

    def get_results(self, params):
        try:
            xrange
        except NameError:
            xrange = range
        params = list(params)
        print("-----> ", params)
        print("-----+>", self.concurrency)
        return self.pool.map(_multiprocess_get,
                             (params[n:n + self.concurrency]
                              for n in xrange(0, len(params), self.concurrency)))

    @classmethod
    def _results_from_concurrent(cls, params):
        return execute_concurrent_with_args(cls.session, cls.prepared, params)

def _multiprocess_get(params):
    return cassandra_db._results_from_concurrent(params)
My calling script:
import os
import pandas as pd
import sys

relative_path = '/home/anji'
sys.path.append(os.path.join(relative_path, 'commons', 'Database Operations'))

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra_db import cassandra_db
from cassandra.policies import ConstantReconnectionPolicy

processes = 2
con_db = cassandra_db(processes)
keys = [(1,), (2,)]
df = con_db.get_results(keys)
print("Result", df.head())
Error:
multiprocessing.pool.MaybeEncodingError: Error sending result: '[[ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x7fa93658bbe0>), ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x7fa936a2e0f0>)]]'. Reason: 'PicklingError("Can't pickle <class 'importlib._bootstrap.ExecutionResult'>: attribute lookup ExecutionResult on importlib._bootstrap failed",)'
I am trying to execute this for 2 keys but am facing the issue above. Can anyone help me solve it?
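One way around this (a minimal sketch, not from the original post): ExecutionResult wraps a live ResultSet handle that cannot be pickled back to the parent process, so the worker can materialize each result into plain rows before returning. This variant of _results_from_concurrent assumes the session uses a row factory that yields picklable rows, e.g. the imported tuple_factory:

    @classmethod
    def _results_from_concurrent(cls, params):
        # Materialize each ResultSet into a plain list of rows so the
        # value returned through pool.map can be pickled.
        results = execute_concurrent_with_args(cls.session, cls.prepared, params)
        materialized = []
        for success, result_or_exc in results:
            if success:
                materialized.append(list(result_or_exc))  # plain row tuples pickle fine
            else:
                materialized.append(result_or_exc)  # exceptions (assumed picklable)
        return materialized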
I created a program (using Tkinter, Python 3 and matplotlib) that could read data from different serial ports, save it to CSV, then create graphs and finally preview the data in a GUI. The code was split into two different scripts. The main script contained reading data, saving data to CSV and previewing the data, and the other script contained the graph creation.
Today I rewrote the code using the answer of @user2464430 here. The code is working, but I can't update the GUI. It opens once and then doesn't refresh with new data.
The following code is part of the total code:
from PIL import ImageTk, Image
import tkinter as Tk
import multiprocessing
from queue import Empty, Full
from time import strftime
import serial
import numpy as np
import matplotlib.pyplot as plt
from drawnow import *
from pylab import *
import pandas as pd
from datetime import timedelta
from datetime import datetime
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import locale
import os

class GuiApp(object):
    def __init__(self, image):
        self.root = Tk.Tk()
        self.root.resizable(width=False, height=False)
        self.root.geometry("1600x800+0+0")
        C = Tk.Canvas(self.root, bg="black", width=1600, height=800)

        def BasicLabels():
            .......  # at this stage create multiple axis labels
            YAxisLabels()
        BasicLabels()

        def ValueLabels():
            .......  # read and manipulate data from the CSV file and print it in labels
        ValueLabels()

        C.pack()

def GenerateData(q):  # Read serial ports and store data to the CSV file
    file_exists = os.path.isfile("BigData.csv")
    header = [["Daytime,T1"]]
    if not file_exists:
        with open("BigData.csv", "a+") as csvfile:
            np.savetxt(csvfile, header, delimiter=",", fmt="%s", comments="")
    while True:
        try:
            ser1 = serial.Serial(port="COM4", baudrate=9600)
            read_ser1 = ser1.readline()
            if read_ser1 == "":
                read_ser1 = "Missing Value"
            else:
                read_ser1 = ser1.readline()
                read_ser1 = str(read_ser1[0:len(read_ser1)].decode("utf-8"))
            # print("COM4:", read_ser1)
            ser1.close()
        except:
            print("Failed 1")
            read_ser1 = "9999,9999,9999,9999,9999"
        daytime = strftime(" %d-%m-%Y %H:%M:%S")
        rows = [daytime + "," + read_ser1.strip()]
        with open("BigData.csv", "a+") as csvfile:
            np.savetxt(csvfile, rows, delimiter=",", fmt="%s", comments="")
        CreateGraphs()

def CreateGraphs():
    # Code to generate the graphs. Called every time I have a new line in the CSV.

if __name__ == "__main__":
    # Queue which will be used for storing data
    q = multiprocessing.Queue()
    q.cancel_join_thread()  # or else the thread that puts data will not terminate
    gui = GuiApp(q)
    t1 = multiprocessing.Process(target=GenerateData, args=(q,))
    t1.start()
    gui.root.mainloop()
    t1.join()
The graphs are generated inside the while True loop in GenerateData.
All the data for the labels and graphs comes from the CSV file, not directly from the serial port.
Is it possible to update the GUI with the latest data from the CSV and the created graphs?
Thanks for your time.
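One common approach (a minimal sketch, not from the original post; CsvGui and poll_csv are hypothetical names): since mainloop() blocks and the writer runs in a separate process, the GUI process can poll the CSV on a timer with Tkinter's after() method and refresh its widgets each tick:

import tkinter as Tk
import pandas as pd

class CsvGui(object):
    def __init__(self):
        self.root = Tk.Tk()
        self.label = Tk.Label(self.root, text="waiting for data...")
        self.label.pack()
        self.poll_csv()  # start the polling loop

    def poll_csv(self):
        try:
            df = pd.read_csv("BigData.csv")
            if not df.empty:
                # Show the most recent row; regenerate graphs here as well.
                self.label.config(text=str(df.iloc[-1].to_dict()))
        except Exception:
            pass  # the file may be mid-write or missing; try again on the next tick
        # after() schedules the next poll without blocking mainloop().
        self.root.after(1000, self.poll_csv)

gui = CsvGui()
gui.root.mainloop()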
I am trying to read from Excel and load the data into MongoDB using PyMongo.
The error I got is "cannot encode object: <NA>, of type: <class 'pandas._libs.missing.NAType'>". When I researched it, I was told to use the utf-8-sig encoding to insert into MongoDB, but in pandas read_excel there is no option to use utf-8.
from pymongo import MongoClient
from datetime import datetime
import pandas as pd
import Parameters
import pandasql as pf
import json
import pymongo
import xlrd
from pathlib import Path
import os
import constants

try:
    class conn:
        def __init__(self):
            client = pymongo.MongoClient("mongodb://" + constants.USER_NAME + ":" + constants.PWD + constants.server + constants.CA_CERTIFICATES_PATH)
            db = client[Parameters.STG_QC_Hub_Files]
            week = "08-02-2021"
            out_col = db[Parameters.col]
            filename = "1.xlsx"
            path1 = Path('//test3' + '/' + filename)
            data_load_date = datetime.today().strftime('%m-%d-%Y')
            df1 = pd.read_excel(path1, sheet_name="AU-ARCM Details", keep_default_na=False)
            # df1 = pd.read_excel(xls+filename, keep_default_na=False, encoding='utf-8-sig')
            # df1 = pd.read_csv(xls, keep_default_na=False, encoding='utf-8-sig').iloc[:, :86]
            df1["Week"] = week
            df1["Data Load Date"] = data_load_date
            df1 = df1.astype('string')
            # df1.index = df1.index.str.encode('utf-8')
            df1 = df1.drop(['Source.Name'], axis=1)
            records = json.loads(df1.T.to_json()).values()
            out_col.insert_many(df1.to_dict('records'))
            print("Imported File " + str(filename) + " with " + str(len(records)) + " records")

    c = conn()
except Exception as e:
    print(e)
Traceback:
File "C:\Users\PycharmProjects\ReMs\venv\lib\site-packages\pymongo\message.py", line 1323, in _do_batched_op_msg
operation, command, docs, check_keys, ack, opts, ctx)
bson.errors.InvalidDocument: cannot encode object: <NA>, of type: <class 'pandas._libs.missing.NAType'>
You have some blank cells in your spreadsheet that pandas has its own type (NAType) for; pymongo doesn't know what to do with this type, hence the error. You will need to replace these values in order to load them into MongoDB using the method you are using.
Consider something like this just before you attempt the insert:
import numpy as np
# Use the dict form: with a plain scalar-to-None replace, pandas treats
# value=None as "fill with the default pad method" instead of inserting None.
df1 = df1.replace({np.nan: None})
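Since df1 was cast with astype('string') above, the missing values are pd.NA rather than np.nan; a variant that handles both (a sketch, assuming df1 and out_col as built in the question's code):

import pandas as pd

# Cast back to object so cells can hold None, then turn every missing
# value (np.nan, pd.NA, NaT) into None, which pymongo stores as BSON null.
df1 = df1.astype(object).where(pd.notnull(df1), None)
out_col.insert_many(df1.to_dict('records'))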
The following error occurs when running the example test below. What am I doing wrong?
Error: Exception('Column(s) id already selected',)
Code:
import unittest
import dask
import numpy as np
import pandas as pd
import dask.dataframe as dd

class TestDaskCustomAgg(unittest.TestCase):
    def mode(self, x):
        val = pd.Series.mode(x)
        if val.empty:
            return np.NaN
        return val[0]

    def test_get_transactions(self):
        df = dask.datasets.timeseries()
        custom_agg = dd.Aggregation('custom_agg', agg=lambda x: self.mode(x), chunk=lambda x0: self.mode(x0))
        df.groupby('name').agg(custom_agg).compute()
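For reference (a sketch, not a confirmed diagnosis of the error above): dd.Aggregation composes a per-partition chunk step with a cross-partition agg step, and it can be applied to an explicit column selection so the grouping and id columns are not aggregated. dask.datasets.timeseries() produces columns id, name, x and y:

import dask
import dask.dataframe as dd

# A simple custom aggregation: chunk runs on each partition's groups,
# agg combines the per-partition results.
custom_sum = dd.Aggregation(
    'custom_sum',
    chunk=lambda s: s.sum(),
    agg=lambda s: s.sum(),
)

df = dask.datasets.timeseries()
result = df.groupby('name')['x'].agg(custom_sum).compute()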
I have __init__.py and blobquickstartv12.py within the same Azure Function "Test-v3". While __init__.py is a blob trigger, blobquickstartv12.py has the Python code that I want to run. The only way I am able to run my code in blobquickstartv12.py is if I paste the entire code within the main() function of __init__.py.
I tried using from blobquickstartv12 import load, where load is defined in my blobquickstartv12.py code, but that gave me Exception: ModuleNotFoundError: No module named 'blobquickstartv12'.
Can anyone tell me how I can call my custom code from within __init__.py?
This is what the structure of my Azure Function looks like:
Here is my code in __init__.py:
import logging
import azure.functions as func
import pandas as pd
import numpy as np
from pandas import ExcelFile
from pandas import ExcelWriter
from datetime import datetime, timedelta
from azure.storage.blob import BlockBlobService
import pyodbc
import sys
import os
from io import StringIO
import pkgutil
from . import blobquickstartv12

def main(myblob: func.InputStream):
    logging.info(f"Python blob trigger function processed blob \n"
                 f"Name: {myblob.name}\n"
                 f"Blob Size: {myblob.length} bytes")
    load = blobquickstartv12.load()
Here is my code for blobquickstartv12.py:
import pyodbc
from datetime import datetime

class load:
    # CODE FOR CONNECTING TO THE SQL DATABASE
    SERVER = 'xxxxxx.database.windows.net'
    DATABASE = 'XYZ'
    username = 'USERNAME'
    pwd = 'PASSWORD'
    driver = '{ODBC Driver 17 for SQL Server}'
    cnxn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + SERVER + ';PORT=1433;DATABASE=' + DATABASE + ';UID=' + username + ';PWD=' + pwd)
    cursor = cnxn.cursor()
    print("Connected to Azure SQL")
    # sqlcommand = ("INSERT INTO Stage.File(File_ID,File_type) VALUES (1235,'D')")
    Curr_dt = datetime.now()
    BLOB_STORAGEACCOUNTNAME = "blobstorage"
    BLOB_STORAGEACCOUNTKEY = "AccountKey"
    BLOBNAME = "BlobName"
    CONTAINERNAME = "ContainerName"
Update:
Please check the structure. On my side there is no problem: the code can import blobquickstartv12 fine.
This is the structure of an Azure Function:
https://learn.microsoft.com/en-us/azure/azure-functions/functions-reference-python#folder-structure
This is the doc on how to import:
https://learn.microsoft.com/en-us/azure/azure-functions/functions-reference-python#import-behavior
Original Answer:
Importing a module inside the function module should be done like this:
For example, I have a dog.py and I want to use it.
This is the dog.py:
class Dog:
    def __init__(self, name):
        super().__init__()
        self.name = name

    def showdog(self):
        print("This is a dog!")
In the __init__.py, you should use this:
from . import dog
mydog = dog.Dog("Woodie")
It works fine on my side.
This is the structure:
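Applied to the question (a sketch under the assumption that load is refactored from a class whose body runs at import time into a plain function in blobquickstartv12.py):

# blobquickstartv12.py
def load():
    # the connection and upload code from the class body goes here
    print("Connected to Azure SQL")

# __init__.py
import azure.functions as func
from . import blobquickstartv12

def main(myblob: func.InputStream):
    blobquickstartv12.load()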
I'm trying to use the multiprocessing library to split a dataframe into parts, process it on multiple CPU cores, and then concatenate the results back into a final dataframe in a Python Dash application. The code works fine when I run it standalone outside of the Dash application, but when I enclose the same code in a Dash application, I get an error. I have shown the code below:
import dash
from dash.dependencies import Input, Output, State
import dash_core_components as dcc
import dash_html_components as html
import flask
import dash_table_experiments as dt
import dash_table
import dash.dependencies
import base64
import time
import os
import pandas as pd
from docx import *
from docx.text.paragraph import Paragraph
from docx.text.paragraph import Run
import xml.etree.ElementTree as ET
import multiprocessing as mp
from multiprocessing import Pool
from docx.document import Document as doctwo
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
import io
import csv
import codecs
import numpy as np

app = dash.Dash(__name__)
application = app.server
app.config.supress_callback_exceptions = True

app.layout = html.Div(children=[
    html.Div([
        html.Div([
            html.H4(children='Reader'),
            html.Br(),
        ], style={'text-align': 'center'}),
        html.Br(),
        html.Br(),
        html.Div([
            dcc.Upload(html.Button('Upload File'), id='upload-data', style=dict(display='inline-block')),
            html.Br(),
        ]),
        html.Div(id='output-data-upload'),
    ])
])

@app.callback(Output('output-data-upload', 'children'),
              [Input('upload-data', 'contents')],
              [State('upload-data', 'filename')])
def update_output(contents, filename):
    if contents is not None:
        content_type, content_string = contents.split(',')
        decoded = base64.b64decode(content_string)
        document = Document(io.BytesIO(decoded))
        combined_df = pd.read_csv('combined_df.csv')

        def calc_tfidf(input1):
            input1 = input1.reset_index(drop=True)
            input1['samplecol'] = 'sample'
            return input1

        num_cores = mp.cpu_count() - 1  # number of cores on your machine
        num_partitions = mp.cpu_count() - 1  # number of partitions to split dataframe
        df_split = np.array_split(combined_df, num_partitions)
        pool = Pool(num_cores)
        df = pd.concat(pool.map(calc_tfidf, df_split))
        pool.close()
        pool.join()
        return len(combined_df)
    else:
        return 'No File uploaded'

app.css.append_css({'external_url': 'https://codepen.io/plotly/pen/EQZeaW.css'})

if __name__ == '__main__':
    app.run_server(debug=True)
The above Dash application takes any file as input. Upon uploading the file in the front end, a local CSV file (any file; in my case it is combined_df.csv) is loaded into a dataframe. Now I want to split the dataframe into parts using multiprocessing, process it and combine it back. But the above code results in the following error:
AttributeError: Can't pickle local object 'update_output.<locals>.calc_tfidf'
What's wrong with this piece of code?
Okay, I've figured it out now! The problem is that the function calc_tfidf was not defined as a global function. I changed it to a global (module-level) function and it worked perfectly.
Simple checks, when left unsolved, can at times lead to days of redundant effort! :(
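For illustration, a minimal sketch of the fix described above (process_dataframe is a hypothetical wrapper for the callback's splitting logic): pickle can only serialize functions importable by name, so the worker must live at module level rather than inside the callback:

import multiprocessing as mp
from multiprocessing import Pool
import numpy as np
import pandas as pd

def calc_tfidf(input1):
    # Module-level functions can be pickled by reference, so Pool.map
    # can send them to worker processes.
    input1 = input1.reset_index(drop=True)
    input1['samplecol'] = 'sample'
    return input1

def process_dataframe(combined_df):
    # Split the dataframe, process the parts in parallel, and
    # concatenate the results back together.
    num_partitions = mp.cpu_count() - 1
    df_split = np.array_split(combined_df, num_partitions)
    with Pool(num_partitions) as pool:
        result = pd.concat(pool.map(calc_tfidf, df_split))
    return result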