Dataframe becoming empty after calling a method in DataWriter class that deletes records from delta table - apache-spark

How can I prevent the dataframe data from becoming empty after calling the delete_processed_data() method of my DataWriter class? The class also has a register_processed_data() method that inserts data into a Delta table.
I'm not overwriting the dataframe; it is only used in a condition that checks, via a count, whether it has any data.
Here's my complete code (databricks notebook):
from datetime import *
import pandas as pd
from dn.utils import table
import pyspark.sql.functions as F
from delta.tables import *
from pyspark.sql.types import *
import json
import pytz
import calendar
list_countries = (
table.get_silver_table(table_name='stpos_dim_itemticket')
.select('pais')
.distinct()
)
list_countries = [row.pais for row in list_countries.collect()]
# Include "Todos" option
list_countries.insert(0, 'Todos')
dbutils.widgets.removeAll()
dbutils.widgets.text(name='category', defaultValue='Todos', label='Categoria')
dbutils.widgets.text(name='today', defaultValue=str(date.today()), label='Fecha proceso')
dbutils.widgets.dropdown(name="country", defaultValue='Todos', choices=list_countries, label="Pais")
dbutils.widgets.dropdown(name='forced_load', defaultValue='no', choices=['si', 'no'], label='Forzar carga')
dbutils.widgets.dropdown(name="reprocessing", defaultValue='si', choices=['si', 'no'], label="Reproceso")
country = dbutils.widgets.get('country').strip()
category = dbutils.widgets.get("category").strip()
today = datetime.strptime(dbutils.widgets.get('today').strip(), '%Y-%m-%d')
wave_date = today.replace(day=1)
forced_load = dbutils.widgets.get('forced_load').strip()
reprocessing = dbutils.widgets.get('reprocessing').lower().strip()
print(f"Categoria: {category}")
print(f"Fecha proceso: {today.strftime('%Y-%m-%d')}")
print(f"Pais: {country}")
print(f"Forzar carga: {forced_load}")
print(f'Reproceso: {reprocessing}')
print(f"Fecha ola: {wave_date.strftime('%Y-%m-%d')}")
class DataExtractor():
def __init__(self, category, today, country, list_countries, wave_date, reprocessing, forced_load):
self.category = category
self.today = today
self.country = country
self.list_countries = list_countries
self.wave_date = wave_date
self.reprocessing = reprocessing
self.forced_load = forced_load
if self.reprocessing == 'no' or self.forced_load == 'si':
self.days_for_wave = self.get_days_for_wave()
if self.country.lower() == 'todos':
self.country_condition = "lower(pais) = lower(pais)"
else:
self.country_condition = f"lower(pais) = lower('{country}')"
if self.category.lower() == 'todos':
self.category_condition = "lower(categoria) = lower(categoria)"
else:
self.category_condition = f"lower(categoria) = lower('{category}')"
def get_days_for_wave_by_country(self, country, path_file):
days_for_wave = (
spark.read.format("com.crealytics.spark.excel")
.option("header", "true")
.option("treatEmptyValuesAsNulls", "true")
.option("inferSchema", "true")
.load(path_file)
.where(f"fecha_ola = '{self.wave_date}'")
.where(f"lower(pais) = lower('{country}')")
.selectExpr(
"fecha_ola",
"to_date(fecha) as fecha_transaccion",
"pais")
)
if days_for_wave.count() == 0:
# Desired year and month
year = self.wave_date.year
month = self.wave_date.month
# Get the number of days in the specified month
_, num_days = calendar.monthrange(year, month)
# Create a list with all the days of the month
days = [(date(year, month, day),) for day in range(1, num_days+1)]
# Convert each date to a string
days_str = [(day[0].strftime("%Y-%m-%d"),) for day in days]
# Convert list to dataframe
days_for_wave = (
spark.createDataFrame(days_str)
.withColumnRenamed("_1", "fecha_transaccion")
.withColumn("fecha_ola", F.lit(self.wave_date))
.withColumn("pais", F.lit(country))
.selectExpr(
"fecha_ola",
"to_date(fecha_transaccion) AS fecha_transaccion",
"pais")
)
print(f"Loaded {days_for_wave.count()} days for wave {self.wave_date.strftime('%Y-%m-%d')} and country {country}")
return days_for_wave
def get_days_for_wave(self):
"""
Get the days for the wave
"""
# Load dim_dia_ola.xlsx with wave definition
path_file = "dbfs:/mnt/storetrack/transitraw/dichterneira/storelive/dim_dia_ola.xlsx"
print(f'Loading days for wave from file: {path_file}...')
if self.country.lower() == 'todos':
# Get list of countries (excluding 'Todos')
list_of_countries = self.list_countries[1:]
else:
list_of_countries = [self.country]
schema = StructType([
StructField("fecha_ola", TimestampType(), nullable=True),
StructField("fecha_transaccion", DateType(), nullable=True),
StructField("pais", StringType(), nullable=True)
])
# Create an empty DataFrame with the specified schema
days_for_wave = spark.createDataFrame([], schema=schema)
for country in list_of_countries:
days_for_wave_by_country = self.get_days_for_wave_by_country(country, path_file)
max_day_of_wave = days_for_wave_by_country.agg(F.max("fecha_transaccion")).collect()[0][0]
if self.today.date() > max_day_of_wave and self.forced_load == 'no':
print(f"Today {self.today.strftime('%Y-%m-%d')} is not included in wave days for country {country} and wave {self.wave_date.strftime('%Y-%m-%d')}")
else:
if country == list_of_countries[0]:
days_for_wave = days_for_wave_by_country
else:
days_for_wave = days_for_wave.union(days_for_wave_by_country)
return days_for_wave
def get_data_items(self):
"""
Filter sales by category, wave and country
"""
if self.reprocessing == 'si' and self.forced_load == 'no':
sales_filtered = (
table.get_silver_table(table_name='sl_fact_item_ticket')
.where(f"fecha_ola = '{self.wave_date}'")
.where(self.country_condition)
.where(self.category_condition)
)
else:
sales_filtered = (
table.get_silver_table(table_name='stpos_dim_itemticket')
.drop("fecha_ola")
.where(self.country_condition)
.where(self.category_condition)
.selectExpr("*", "to_date(date) as fecha_transaccion")
.join(self.days_for_wave, ["fecha_transaccion", "pais"], how="inner")
.drop("fecha_transaccion")
)
print(f"{sales_filtered.count()} items loaded. [Get data items]")
return sales_filtered
def get_product_catalog(self):
product_catalog = (
table.get_bronze_table(table_name='brz_catalogo_productos', module_name='catalogo')
.where(self.country_condition)
.selectExpr(
"upc as barcode",
"pais",
"categoria",
"marca",
"submarca",
"fabricante",
"""CASE WHEN lower(split(contenido, ' ')[1]) = 'ml' THEN 'L'
WHEN lower(split(contenido, ' ')[1]) = 'gr' THEN 'Kg'
WHEN lower(split(contenido, ' ')[1]) = 'und' THEN 'Und'
END AS unidad_std""",
"conversion AS contenido_std",
"split(contenido, ' ')[0] AS contenido",
"split(contenido, ' ')[1] AS unidad_medida",
"idref AS id_ref"
)
)
return product_catalog
class DataEnricher():
def __init__(self, reprocessing, forced_load):
self.reprocessing = reprocessing
self.forced_load = forced_load
def rename_fields(self, df_item):
if self.reprocessing == 'no' or self.forced_load == 'si':
print("Renaming fields...")
df_item = (
df_item
.selectExpr(
'CAST(fecha_ola AS DATE) AS fecha_ola',
'pdv AS nombre_pdv',
'marca',
'submarca',
'pais',
'contenido',
'unidad_medida',
'CAST(cantidad AS DOUBLE) as cantidad',
'CAST(precio_local AS DOUBLE) as precio_local',
'barcode',
'date AS fecha_transaccion',
'categoria',
'categoria_name',
'descripcion',
'id_ref',
'posdb_id',
'id_ticket',
'id_item',
'id_pdv',
'venta_usd',
'venta_local',
'precio_usd',
'id_canasto'
)
)
return df_item
def calculate_standard_fields(self, df_item):
if self.reprocessing == 'no' or self.forced_load == 'si':
print("Caculating standard fields...")
df_item = (
df_item
# Add column with converted Ml to L and Gr to Kg
.withColumn("contenido_std",
F.when(F.col("unidad_medida") == "Ml", F.col("contenido") / 1000)
.when(F.col("unidad_medida") == "Gr", F.col("contenido")/1000)
.otherwise(F.col("contenido")))
.withColumn("unidad_std",
F.when(F.col("unidad_medida") == "Ml", F.lit("L"))
.when(F.col("unidad_medida") == "Gr", F.lit("Kg")))
)
return df_item
def calculate_fields(self, df_items):
"""
Set the time zone of the dataframe
"""
if self.reprocessing == 'no' or self.forced_load == 'si':
print("Calulating time zone field...")
# Create dataframe with the time zone
time_zone = [(1, '05:00:00', '09:59:59'),
(2, '10:00:00', '13:59:59'),
(3, '14:00:00', '19:59:59'),
(4, '20:00:00', '23:59:59'),
(4, '00:00:00', '04:59:59')]
time_zone = spark.createDataFrame(time_zone, ['id_franja', 'inicio', 'fin'])
# Convert inicio and fin to datetime
time_zone = (
time_zone
.withColumn("inicio", F.to_timestamp(F.col("inicio"), "HH:mm:ss"))
.withColumn("fin", F.to_timestamp(F.col("fin"), "HH:mm:ss"))
)
df_items = (
df_items
.withColumn("hora_transaccion", F.substring(F.col("fecha_transaccion"), 12, 8))
.withColumn("hora_transaccion", F.to_timestamp(F.col("hora_transaccion"), "HH:mm:ss"))
.join(time_zone, on=F.col("hora_transaccion").between(F.col("inicio"), F.col("fin")), how="left")
.drop("hora_transaccion", "inicio", "fin")
)
return df_items
def update_product_features(self, data, product_catalog):
if data.count() > 0:
print("Updating fields from brz_catalogo_productos")
data = (
data
.drop("categoria", "marca", "submarca", "fabricante", "unidad_std", "contenido_std", "contenido", "unidad_medida", "id_ref")
.join(product_catalog, on=["barcode", "pais"], how="left")
)
return data
class DataWriter():
def __init__(self, wave_date, country, category):
self.wave_date = wave_date
self.country = country
self.category = category
if self.country.lower() == 'todos':
self.country_condition = "lower(pais) = lower(pais)"
else:
self.country_condition = f"lower(pais) = lower('{country}')"
if self.category.lower() == 'todos':
self.category_condition = "lower(categoria) = lower(categoria)"
else:
self.category_condition = f"lower(categoria) = lower('{category}')"
def delete_processed_data(self, datos):
df_categoria_activa = (
table.get_bronze_table(
table_name='sl_configuracion_procesamiento_zona_silver',
module_name='storetrack'
)
.where(f"fecha_ola = '{wave_date}' and lower(trim(procesar)) = 'si'")
.where(self.country_condition)
.where(self.category_condition)
.selectExpr(
"categoria",
"pais",
"fecha_ola"
)
)
if datos.count() > 0:
display(datos.where("categoria = 'Galletas dulces'"))
table_path = table.get_silver_table_path(table_name="sl_fact_item_ticket")
deltaTableToWrite = DeltaTable.forPath(spark, table_path)
print("Deleting old rows...")
deltaTableToWrite.alias('current')\
.merge(
df_categoria_activa.alias('delete'),
'current.pais = delete.pais AND current.categoria = delete.categoria AND current.fecha_ola = delete.fecha_ola')\
.whenMatchedDelete()\
.execute()
display(datos.where("categoria = 'Galletas dulces'"))
def register_processed_data(self, data):
if data.count() > 0:
print("Inserting new rows...")
display(data.where("categoria = 'Galletas dulces'"))
table_path = table.get_silver_table_path(table_name="sl_fact_item_ticket")
deltaTableToWrite = DeltaTable.forPath(spark, table_path)
deltaTableToWrite.alias('current')\
.merge(
data.alias('new'),
'current.id_item = new.id_item AND current.fecha_ola = new.fecha_ola')\
.whenNotMatchedInsert(values =
{
"fecha_ola": "new.fecha_ola",
"marca": "new.marca",
"submarca": "new.submarca",
"pais": "new.pais",
"contenido": "new.contenido",
"unidad_medida": "new.unidad_medida",
"cantidad": "new.cantidad",
"precio_local": "new.precio_local",
"barcode": "new.barcode",
"fecha_transaccion": "new.fecha_transaccion",
"categoria": "new.categoria",
"categoria_name": "new.categoria_name",
"descripcion": "new.descripcion",
"id_ref": "new.id_ref",
"posdb_id": "new.posdb_id",
"id_ticket": "new.id_ticket",
"id_item": "new.id_item",
"id_pdv": "new.id_pdv",
"venta_usd": "new.venta_usd",
"venta_local": "new.venta_local",
"precio_usd": "new.precio_usd",
"nombre_pdv": "new.nombre_pdv",
"contenido_std": "new.contenido_std",
"unidad_std": "new.unidad_std",
"id_canasto": "new.id_canasto",
"id_franja": "new.id_franja"
}
)\
.execute()
display(data.where("categoria = 'Galletas dulces'"))
print(f"{data.count()} items loaded. [Write processed data]")
else:
print("No data to save in silver.sl_fact_item_ticket")
if __name__ == '__main__':
data_extractor = DataExtractor(category, today, country, list_countries, wave_date, reprocessing, forced_load)
data = data_extractor.get_data_items()
product_catalog = data_extractor.get_product_catalog()
cleaner = DataCleaner(wave_date, country, category, reprocessing, forced_load)
data = cleaner.clean_data(data)
data_enricher = DataEnricher(reprocessing, forced_load)
data = data_enricher.rename_fields(data)
data = data_enricher.calculate_standard_fields(data)
data = data_enricher.calculate_fields(data)
data = data_enricher.update_product_features(data, product_catalog)
data_write = DataWriter(wave_date, country, category)
data_write.delete_processed_data(data)
data_write.register_processed_data(data)
The parameters with which I am running the notebook are:
Categoria: Todos
Fecha proceso: 2022-12-01
Pais: Todos
Forzar carga: no
Reproceso: si
Fecha ola: 2022-12-01
The following output is displayed:
993313 items loaded. [Get data items]
62023 items loaded. [Remove blocked categories]
Updating fields from brz_catalogo_productos
[DISPLAYED ROWS OF DATAFRAME data]
Deleting old rows...
Query returned no results
No data to save in silver.sl_fact_item_ticket
Any insights on why the dataframe is getting cleared would be greatly appreciated.
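Since this run uses Reproceso: si and Forzar carga: no, get_data_items() builds data from sl_fact_item_ticket, which is the same Delta table that delete_processed_data() deletes from, and Spark DataFrames are lazy, so every later count()/display() re-reads that table. A minimal sketch (an assumption, not the original fix) of materializing the dataframe before the delete:
# Hedged sketch (not from the original notebook): materialize the dataframe before
# deleting from the source table, so later actions read the saved copy instead of
# re-scanning the now-emptied Delta table.
data = data_enricher.update_product_features(data, product_catalog)
data = data.localCheckpoint()   # or data.persist() followed by data.count()
data_write = DataWriter(wave_date, country, category)
data_write.delete_processed_data(data)
data_write.register_processed_data(data)
localCheckpoint() cuts the lineage entirely; a plain persist() can still recompute from the deleted table if the cache is evicted.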

Related

Inserting pandas dataframe into django model

I am having an issue writing a dataframe into my django models.py.
The file is long, but its methodology is quite simple:
- import modules
- create the django database
- requests.get the necessary data
- alter the data a bit to fit my goals and save it as a df
- connect to the django db and insert the df
My models.py is the following:
from django.db import models
import requests
import pandas as pd
from datetime import timezone
from datetime import datetime
from datetime import date
from datetime import timedelta
import time
from django.conf import settings
from sqlalchemy.engine import create_engine
class cryptoData(models.Model):
coin = models.CharField(max_length=10)
asset_id = models.SmallIntegerField()
time = models.DateTimeField()
close = models.FloatField()
volume = models.BigIntegerField()
market_cap = models.FloatField()
reddit_posts = models.IntegerField()
reddit_comments = models.IntegerField()
tweets = models.IntegerField()
tweet_favorites = models.IntegerField()
social_volume = models.IntegerField()
lunarcrush_key = 'fakekey1234'
def top_coins():
lc_market = requests.get(
url = 'https://api.lunarcrush.com/v2?data=market&',
params = {
'key': lunarcrush_key,
}
)
all_coins = []
for entry in lc_market.json().get('data'):
coin = []
coin.append(entry.get('s'))
coin.append(entry.get('mc'))
all_coins.append(coin)
all_coins.sort(key = lambda x : x[1], reverse = True)
top_ten_coins = all_coins[:10]
return(top_ten_coins)
top_coins_lst = top_coins()
top_coin_names_lst = [x[0] for x in top_coins_lst]
def get_coin_data(key, coin, date_diff, start_date, end_date):
lc = requests.get(
url = 'https://api.lunarcrush.com/v2?data=assets&',
params = {
'key': lunarcrush_key,
'symbol': coin,
'interval': 'day',
'data_points': date_diff,
'start': int(start_date.replace(tzinfo=timezone.utc).timestamp()),
'end': int(end_date.replace(tzinfo=timezone.utc).timestamp())
}
)
metric_names = []
for entry in lc.json().get('data')[0].get('timeSeries'):
for key in entry:
metric_names.append(key) if key not in metric_names else metric_names
metrics_list = []
for entry in lc.json().get('data')[0].get('timeSeries'):
row_list = []
for key in entry:
row_list.append(entry.get(key))
metrics_list.append(row_list)
metrics_df = pd.DataFrame(metrics_list, columns = metric_names)
metrics_df['time'] = metrics_df['time'].apply(lambda x : datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
metrics_df['coin'] = coin
cols = list(metrics_df)
cols.insert(0, cols.pop(cols.index('coin')))
metrics_df = metrics_df.loc[:, cols]
return(metrics_df)
def get_all_coins_data(coins_list):
appended_data = []
end_date = datetime.now()
start_date = end_date - timedelta(days = 700)
date_diff = (end_date - start_date).days
for coin in coins_list:
appended_data.append(get_coin_data(lunarcrush_key, coin, date_diff, start_date, end_date))
time.sleep(.1)
output = pd.concat(appended_data)
return(output)
df = get_all_coins_data(top_coin_names_lst)
focused_df = df[['coin', 'asset_id', 'time', 'close', 'volume', 'market_cap', 'reddit_posts', 'reddit_comments', 'tweets', 'tweet_favorites', 'social_volume']]
user = settings.DATABASES['default']['USER']
password = settings.DATABASES['default']['PASSWORD']
database_name = settings.DATABASES['default']['NAME']
database_url = 'sqlite://{user}:{password}#localhost:5432/{database_name}'.format(
user=user,
password=password,
database_name=database_name,
)
engine = create_engine(database_url, echo=False)
focused_df.to_sql(cryptoData, con=engine)
When I run the manage.py runserver command, I get the following error:
sqlalchemy.exc.ArgumentError: Invalid SQLite URL: sqlite://user:password#localhost:5432/C:\Users\user\Programming\django_crypto_v6\source\db.sqlite3
Valid SQLite URL forms are:
sqlite:///:memory: (or, sqlite://)
sqlite:///relative/path/to/file.db
sqlite:////absolute/path/to/file.db
I'm struggling to resolve this issue. Any thoughts?
You are using the wrong URL pattern for the SQLite database_url.
See the docs at https://docs.sqlalchemy.org/en/14/core/engines.html#sqlite
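For illustration, a minimal sketch of a corrected URL, assuming database_name already holds the path from settings.DATABASES['default']['NAME'] (SQLite URLs carry no user, password, host, or port):
from sqlalchemy.engine import create_engine

# three slashes + relative path, or sqlite:////absolute/path on Linux;
# on Windows an absolute path also follows three slashes, e.g. sqlite:///C:\path\db.sqlite3
database_url = 'sqlite:///{database_name}'.format(database_name=database_name)
engine = create_engine(database_url, echo=False)
focused_df.to_sql('cryptoData', con=engine)  # to_sql expects a table name string, not the model class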

How to create variables based on multiple conditions?

In the code below, I need to create two variables, flag1 and flag2, based on multiple conditions. I used the np.select approach shown below, but I wonder what other ways there are to do this. In my real work situation there will be more conditions for creating the flags. Any advice or suggestions would be great.
import numpy as np
import pandas as pd
start_date = '2020-04-01'
end_date = '2020-05-01'
d1 = {'customer type': ['walk in', 'online app', 'phone app', 'referral'],
      'office visit': ['location1', 'location1', 'location1', 'location1'],
      'date1': ['2020-04-17', '2020-05-17', '2020-03-01', '2020-05-01'],
      'date2': ['2020-05-18', '2020-04-18', '2020-04-03', '2020-05-19']}
df1=pd.DataFrame(data=d1)
con1 = [ (df1['date1'] >= start_date ) & (df1['date1'] < end_date )]
result1 = ['yes']
df1['flag1'] = np.select(con1, result1)
con2 = [ (df1['date2'] >= start_date ) & (df1['date2'] < end_date )]
result2 = ['yes']
df1['flag2'] = np.select(con2, result2)
You could use a dictionary, dynamically building keys that stand in for the variable names and assigning them the corresponding values.
For example:
import numpy as np
import pandas as pd
start_date = '2020-04-01'
end_date = '2020-05-01'
flags = dict()
flag_string = 'flag'
# This creates the strings flag1 and flag2 automatically
for i in range(1, 3):
# concatenate the flag_string with the index of the loop
flags[flag_string + str(i)] = flag_string + str(i)
print(flags)
d1 = {'customer type': ['walk in', 'online app', 'phone app', 'referral'],
      'office visit': ['location1', 'location1', 'location1', 'location1'],
      'date1': ['2020-04-17', '2020-05-17', '2020-03-01', '2020-05-01'],
      'date2': ['2020-05-18', '2020-04-18', '2020-04-03', '2020-05-19']}
df1=pd.DataFrame(data=d1)
con1 = [ (df1['date1'] >= start_date ) & (df1['date1'] < end_date )]
result1 = ['yes']
df1[flags['flag1']] = np.select(con1, result1)
con2 = [ (df1['date2'] >= start_date ) & (df1['date2'] < end_date )]
result2 = ['yes']
df1[flags['flag2']] = np.select(con2, result2)
This is how you can substitute dictionary values as variables. I've also included a for loop that builds your flag dictionary.
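As an additional sketch (the column/flag pairing below is illustrative, not from the post), the same idea can be driven by a loop over (column, flag) pairs, so adding more conditions only means extending the list:
import numpy as np
import pandas as pd

start_date, end_date = '2020-04-01', '2020-05-01'
df1 = pd.DataFrame({'date1': ['2020-04-17', '2020-05-17', '2020-03-01', '2020-05-01'],
                    'date2': ['2020-05-18', '2020-04-18', '2020-04-03', '2020-05-19']})

# flag1 comes from date1, flag2 from date2; each uses the same date-window rule
for i, col in enumerate(['date1', 'date2'], start=1):
    in_window = (df1[col] >= start_date) & (df1[col] < end_date)
    df1[f'flag{i}'] = np.select([in_window], ['yes'])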

Initialize Model Class Variable At Runtime

I am trying to import student data from an Excel workbook. I have to select the column_name values of the StudentMasterResource class dynamically, based on what is present in the file. The constants module holds one dictionary, named column_name, with all the column names. The first time I run the import it works; after that it fails.
constants.py
column_name = dict()
resource.py
from common_account import constants
from import_export import widgets, fields, resources
def getClassName(key):
if key in constants.column_name:
return constants.column_name[key]
return key
class StudentMasterResource(resources.ModelResource):
organisation_id = fields.Field(
column_name=getClassName('organisation_id'),
attribute='organisation_id',
widget=widgets.ForeignKeyWidget(OrganisationMaster, 'organisation_name'),
saves_null_values=True
)
name = fields.Field(
column_name=getClassName('Name'),
attribute='name',
saves_null_values=True,
widget=widgets.CharWidget()
)
date_of_birth = fields.Field(
column_name=getClassName('date'),
attribute='date_of_birth',
saves_null_values=True,
widget=widgets.DateWidget()
)
views.py
from common_account import constants
from tablib import Dataset
@api_view(['POST'])
@permission_classes([IsAuthenticated])
def student_import(request):
if request.method == 'POST':
context_data = dict()
data_set = Dataset()
file = request.FILES['myfile']
extension = file.name.split(".")[-1].lower()
column_data = request.data
is_import = column_name['is_import']
constants.valid_data.clear()
constants.invalid_data.clear()
if extension == 'csv':
data = data_set.load(file.read().decode('utf-8'), format=extension)
else:
data = data_set.load(file.read(), format=extension)
constants.column_name = {
'date' : column_data.get('birth'),
'name' : column_data.get('name'),
}
if is_import == 'No':
result = student_resource.import_data(data_set, organisation_id = request.user.organisation_id,
offering_id = offering_id,all_invalid_data = False, dry_run=True, raise_errors=True)
context_data['valid_data'] = constants.valid_data
context_data['invalid_data'] = constants.invalid_data
context_data[constants.RESPONSE_RESULT] = {"Total records":student_resource.total_cnt,
"skip records":len(constants.invalid_data),
"Records imported": len(constants.valid_data),
}
return JsonResponse(context_data)
elif is_import == 'Yes':
result = student_resource.import_data(data_set, organisation_id = request.user.organisation_id,
offering_id = offering_id,all_invalid_data = False, dry_run=False, raise_errors=False)
context_data[constants.RESPONSE_ERROR] = False
context_data[constants.RESPONSE_MESSAGE] = 'Data Imported !!!'
context_data[constants.RESPONSE_RESULT] = {"Total records":student_resource.total_cnt,
"skip records":len(constants.invalid_data),
"Records imported": len(constants.valid_data),
}
return JsonResponse(context_data)
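The symptom described usually comes down to when class-body code runs: the fields.Field(...) calls execute once, at import time, before the view updates constants.column_name. A minimal generic illustration of that mechanism (all names hypothetical, independent of django-import-export):
config = {}

def lookup(key):
    # mirrors getClassName: fall back to the key if it is not configured yet
    return config.get(key, key)

class Demo:
    label = lookup('name')   # evaluated right now, while config is still empty

config['name'] = 'Student Name'   # later updates do not touch the class attribute
print(Demo.label)                 # prints 'name'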

pyspark modify class attributes using spark.sql.rdd.foreach()

The main task is to connect to Hive and read data using a Spark RDD.
I have tried the code below. Connection and reading are both successful, but when I try to modify the value of self.jobUserProfile, it fails. I print this value in three positions (marked #1, #2 and #3). In the first position the value is valid, but in the second and third positions the dict is empty; it seems the modification never gets assigned to the class attribute.
I have tried response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10').collect() and iterating over that result, but when the data volume is large the performance may decline.
When I change response.rdd.foreach(lambda x: self.readLoginFunction(x)) to response.rdd.map(lambda x: self.readLoginFunction(x)), the target value in all three positions is empty.
I'm a newbie in Spark. Any advice would be helpful. Thanks in advance.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
def __init__(self):
self.jobUserProfile = collections.defaultdict(dict)
def readLoginFunction(self, e):
dic = collections.defaultdict()
dic['userid'] = e[0]
dic['logtime'] = e[1]
self.jobUserProfile[e[0]] = dic
print(self.jobUserProfile) #1
def readLogin(self, spark):
response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
response.rdd.foreach(lambda x: self.readLoginFunction(x))
print(self.jobUserProfile) #2
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
operateHive = OperateHive()
operateHive.readLogin(spark)
print(operateHive.jobUserProfile) #3
Finally the code below works.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
def readLoginFunction(self, e,jobUserProfile, devAppProfile):
dic = collections.defaultdict()
dic['userid'] = e[0]
dic['logtime'] = e[1]
jobUserProfile[e[0]] = dic
devAppProfile[e[0]] = dic
print(jobUserProfile)
return jobUserProfile, devAppProfile
def readLogin(self, spark, jobUserProfile,devAppProfile):
response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
return rdd1.top(1)[0][0]
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
jobUserProfile = collections.defaultdict(dict)
devAppProfile = collections.defaultdict(dict)
operateHive = OperateHive()
jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
print(jobUserProfile)
But when I remove devAppProfile, the code looks like below:
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
def readLoginFunction(self, e,jobUserProfile, devAppProfile):
dic = collections.defaultdict()
dic['userid'] = e[0]
dic['logtime'] = e[1]
jobUserProfile[e[0]] = dic
devAppProfile[e[0]] = dic
print(jobUserProfile)
return jobUserProfile
def readLogin(self, spark, jobUserProfile,devAppProfile):
response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
jobUserProfile = collections.defaultdict(dict)
devAppProfile = collections.defaultdict(dict)
operateHive = OperateHive()
operateHive.readLogin(spark, jobUserProfile, devAppProfile)
This rdd.map() version does not work: map is a lazy transformation and nothing triggers it, so readLoginFunction never runs and nothing is printed.
Then I change the code as below, which works again.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
def readLoginFunction(self, e,jobUserProfile, devAppProfile):
dic = collections.defaultdict()
dic['userid'] = e[0]
dic['logtime'] = e[1]
jobUserProfile[e[0]] = dic
devAppProfile[e[0]] = dic
print(jobUserProfile)
return jobUserProfile
def readLogin(self, spark, jobUserProfile,devAppProfile):
response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
return rdd1.collect()[-1]
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
jobUserProfile = collections.defaultdict(dict)
devAppProfile = collections.defaultdict(dict)
operateHive = OperateHive()
jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
print(jobUserProfile)
The problem in the post is about closures, but I haven't worked out why the three versions in the answer behave differently.
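Two Spark behaviours explain most of the differences above: transformations like map are lazy until an action runs them, and closures are pickled and executed in separate worker processes, so driver-side objects are only updated via returned results (or accumulators). A minimal sketch, assuming an existing SparkSession named spark:
rdd = spark.sparkContext.parallelize([1, 2, 3])
counter = {}

def update(x):
    counter[x] = x          # mutates a copy living in the executor process
    return x

rdd.map(update)             # lazy: no action, so update() never runs
rdd.map(update).count()     # runs on executors, yet the driver's counter stays empty
print(counter)              # {} -- bring data back with collect()/top() or an accumulator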

failed to execute script python exe

I'm trying to convert my Python script to an EXE.
The script does a basic analysis of an Excel file and generates a report in PDF.
The script also creates a PNG file and then loads it back into the PDF.
When I try to convert the py file to an EXE, it doesn't work :(
The script (works great as a py file):
import pandas as pd
import os
import sys  # needed for sys.path[0] in main()
from pandasql import sqldf
from datetime import datetime
import numpy as nu
from tkinter import *
import tkinter as tk
from fpdf import FPDF
import matplotlib.pyplot as plt
def start_gui(root):
myLabel = Label(root, text='Hi! Here you can output the sessions report').grid(row=0, column=0)
start_button = Button(root, text='Produce Report', padx=30, pady=20, command=main, fg='blue').grid(row=50, column=0)
root.mainloop()
pass
def print_full_results(df):
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print(df)
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pass
def load_data(path):
df = pd.read_csv(path)
df = pd.DataFrame(df)
return df
def clean_raw_data(raw_data):
raw_data = raw_data.dropna(how='all') # Drop the rows where all elements are missing.
raw_data = raw_data.dropna(axis=1, how='all') # Drop the columns where all elements are missing.
raw_data = raw_data.reset_index() # Reset the indexes after droping rows
raw_data = raw_data.drop(columns=['index'])
raw_data = raw_data.rename(
columns={'Meeting ID': 'Meeting_ID', 'User Name': 'Admin_User_Name', 'Uzer Eam1l': 'Admin_Email',
'Has Zoom Rooms?': 'Has_Zoom_Rooms', 'Creation Time': 'Meeting_Creation_Time',
'Start Time': 'Meeting_Start_Time', 'End Time': 'Meeting_End_Time',
'Duration (Minutes)': 'Meeting_Duration_min', 'Ncmf (prjgjncl Ncmf)': 'User_Name',
'Usfr fncil': 'User_Email', 'Join Time': 'User_Join_Time', 'Leave Time': 'User_Leave_Time',
'Duration (Minutes).1': 'User_Duration_min'})
raw_data = convert_relevant_types(raw_data)
raw_data = fill_null_emails(raw_data)
return raw_data
def convert_relevant_types(db):
pd.options.mode.chained_assignment = None # default='warn'
# relevant columns (Meeting_Creation_Time,Meeting_Start_Time,Meeting_End_Time,User_Join_Time,User_Leave_Time): convert string to date
for i in range(len(db['Meeting_Start_Time'])):
creation_date = datetime.strptime(db['Meeting_Creation_Time'][i], '%m/%d/%y %H:%M')
start_date = datetime.strptime(db['Meeting_Start_Time'][i], '%m/%d/%y %H:%M')
end_date = datetime.strptime(db['Meeting_End_Time'][i], '%m/%d/%y %H:%M')
user_join_date = datetime.strptime(db['User_Join_Time'][i], '%m/%d/%y %H:%M')
user_leave_date = datetime.strptime(db['User_Leave_Time'][i], '%m/%d/%y %H:%M')
db['Meeting_Creation_Time'][i] = creation_date
db['Meeting_Start_Time'][i] = start_date
db['Meeting_End_Time'][i] = end_date
db['User_Join_Time'][i] = user_join_date
db['User_Leave_Time'][i] = user_leave_date
# relevant columns (Meeting_Duration_min,User_Duration_min): convert string to int
for i in range(len(db['Meeting_Duration_min'])):
db['Meeting_Duration_min'][i] = int(db['Meeting_Duration_min'][i])
db['User_Duration_min'][i] = int(db['User_Duration_min'][i])
return db
def fill_null_emails(db):
for i in range(len(db['User_Email'])):
if pd.isnull(db['User_Email'][i]):
db['User_Email'][i] = db['User_Name'][i] + ' Missing Mail'
return db
def pdff_space_down(pdf):
pdf.cell(0, 10, '', ln=1, align='L')
return pdf
def pdff_write(pdf, text, space=5, align='L'):
pdf.cell(0, space, text, ln=1, align='L')
return pdf
def pdff_write_table(pdf, data, spacing=1.5):
col_width = pdf.w / 4.5
row_height = pdf.font_size
for row in data:
for item in row:
pdf.cell(col_width, row_height * spacing,
txt=item, border=1)
pdf.ln(row_height * spacing)
return pdf
def create_pdf(today,min_date, max_date, sessions_num, total_cost, costs_table, num_of_users, avg_users_come):
pdf = FPDF(orientation='p', unit='mm', format='A4')
pdf.add_page()
pdf.set_font('Arial', size=10)
pdf.cell(0, 10, 'Date:{}'.format(today), ln=1, align='L')
pdf.set_font('times', 'B', size=24)
pdf.cell(0, 8, 'Home Assignment - Ziv Mor', ln=1, align='C')
pdf.set_font('times', size=18)
pdf.cell(0, 10, 'Zoom-Sessions Report (Automated by Python)', ln=1, align='C')
pdf.cell(0, 10, '({}'.format(min_date) + ' To {})'.format(max_date), ln=1, align='C')
pdf.set_font('times', 'U', size=15)
pdf = pdff_write(pdf, 'Sessions Analysis', space=20)
pdf.set_font('times', size=13)
pdf = pdff_write(pdf, 'Total Number of Sessions: {} (Team meetings are not include)'.format(sessions_num), space=15)
pdf.set_font('times', 'UB', size=13)
pdf.cell(0, 10, 'Number Of Sessions By Dates', ln=1.5, align='C')
pdf.image('sessions_by_day_plot.png', x=55, y=None, w=100, h=70, type='', link='')
pdf = pdff_space_down(pdf)
pdf.set_font('times', size=13)
pdf = pdff_write(pdf, 'Sessions Participants Segmentation:', space=10)
pdf = pdff_write_table(pdf, costs_table)
pdf.set_font('times', 'UB', size=13)
pdf.cell(0, 20, 'Sessions Total Cost: {} NIS'.format(total_cost), ln=1, align='C')
pdf.set_font('times', 'U', size=15)
pdf = pdff_write(pdf, 'Users Analysis', space=17)
pdf.set_font('times', size=13)
pdf = pdff_write(pdf, 'Total Number of Users Engaged: {}'.format(num_of_users), space=10)
pdf = pdff_write(pdf, 'The Average Frequency of Arrival of Each User : {} Sessions'.format(avg_users_come),
space=10)
pdf.output('Zoom Report_{}.pdf'.format(str(datetime.today()).replace(':', '.', 3)))
def main():
path = os.path.join(sys.path[0], 'participant sessions data.csv')
raw_data = load_data(path)
zoom_db = clean_raw_data(raw_data)
'''------------------------------SQL Queries---------------------------------'''
# TODO: assume "פגישת צוות" (team meeting) sessions are not counted
question_1_query = 'Select date(Meeting_Start_Time)date, count(distinct Meeting_Start_Time)Num_Of_Sessions From zoom_db where Topic <>"פגישת צוות" Group by date(Meeting_Start_Time)'
answer_1_table = sqldf(question_1_query)
num_of_sessions = nu.sum(list(answer_1_table['Num_Of_Sessions']))
# count for each meeting the number of participants
question_2_query = 'Select Topic, Meeting_Start_Time, count(Distinct User_Email)num_of_Users From zoom_db Group by Meeting_Start_Time, Meeting_ID'
answer_2_table = sqldf(question_2_query)
# count for each user number of times the user arrived to session
# TODO: note that rows where the user stayed less than 1 minute are not counted; many users have no email, so those were filled with a 'Missing Mail' placeholder earlier
question_3_query = 'select User_Email, count(*)num_of_arrivals from(Select User_Email, Meeting_Start_Time, Meeting_ID From zoom_db Where User_Duration_min <> 0 Group by User_Email, Meeting_ID , Meeting_Start_Time) group by User_Email Order by num_of_arrivals desc'
answer_3_table = sqldf(question_3_query)
# Calculate the average number of arrivals per user (using the result of the 3rd query). TODO: assuming the host is not counted
participants_arrivals_list = list(answer_3_table['num_of_arrivals'])[1:]
avg_users_come = round((nu.average(participants_arrivals_list)), 2)
'''---------------------More Calculates for the report------------------------'''
# Calculate the intervals of dates
min_date_qu = sqldf('select min(date(Meeting_Start_Time)) from zoom_db')
min_date_qu = list(min_date_qu['min(date(Meeting_Start_Time))'])[0]
max_date_qu = sqldf('select max(date(Meeting_Start_Time)) from zoom_db')
max_date_qu = list(max_date_qu['max(date(Meeting_Start_Time))'])[0]
num_meetings0_5 = sqldf('select count(*) from answer_2_table where num_of_users<=5 and Topic <>"פגישת צוות"')
num_meetings0_5 = list(num_meetings0_5['count(*)'])[0]
num_meetings5_10 = sqldf(
'select count(*) from answer_2_table where num_of_users>5 and num_of_users<=10 and Topic <>"פגישת צוות"')
num_meetings5_10 = list(num_meetings5_10['count(*)'])[0]
num_meetings10_15 = sqldf(
'select count(*) from answer_2_table where num_of_users>10 and num_of_users<=15 and Topic <>"פגישת צוות"')
num_meetings10_15 = list(num_meetings10_15['count(*)'])[0]
num_meetings_15_plus = sqldf('select count(*) from answer_2_table where num_of_users>15 and Topic <>"פגישת צוות"')
num_meetings_15_plus = list(num_meetings_15_plus['count(*)'])[0]
total_cost = 50 * num_meetings0_5 + 100 * num_meetings5_10 + 150 * num_meetings10_15 + 200 * num_meetings_15_plus
costs_table = [['Session type', 'Number of sessions', 'Cost'],
['0-5 participants', str(num_meetings0_5), str(50 * num_meetings0_5)],
['5-10 participants', str(num_meetings5_10), str(100 * num_meetings5_10)],
['10-15 participants', str(num_meetings10_15), str(150 * num_meetings10_15)],
['15+ participants', str(num_meetings_15_plus), str(200 * num_meetings_15_plus)]]
sessions_by_day_plot = answer_1_table.plot.bar(x='date', y='Num_Of_Sessions', rot=80)
plt.savefig('sessions_by_day_plot.png')
num_of_users = sqldf('select count(*) From answer_3_table')
num_of_users = list(num_of_users['count(*)'])[0]
today = datetime.today().strftime("%b-%d-%Y")
'''----------------------------------Out-Put Results------------------------'''
create_pdf(today = today , max_date=max_date_qu, min_date=min_date_qu, sessions_num=num_of_sessions,
total_cost=total_cost, costs_table=costs_table, num_of_users=num_of_users, avg_users_come=avg_users_come)
writer = pd.ExcelWriter('Zoom Report_{}.xlsx'.format(str(datetime.today()).replace(':', '.', 3)))
(answer_2_table).to_excel(writer , sheet_name='Sessions Number of Participants')
(answer_3_table).to_excel(writer, sheet_name='Participants show-up')
writer.save()
'''---------------------Delete not relevant files------------------------'''
plot1_path = os.path.join(sys.path[0], 'sessions_by_day_plot.png')
os.remove(plot1_path)
exit()
if __name__ == '__main__':
root = Tk()
start_gui(root)
# main()
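No build command or error trace is shown, so only a hedged sketch of the packaging step is possible. Assuming PyInstaller is the tool being used and the script is saved as report_script.py (hypothetical name), the build can be driven from Python like this, with the input CSV kept next to the resulting EXE:
# Hedged sketch: equivalent to running `pyinstaller --onefile report_script.py`.
# --hidden-import is only needed if a package imported indirectly is not detected
# automatically; pandasql here is an assumption, not a known fix.
import PyInstaller.__main__

PyInstaller.__main__.run([
    'report_script.py',
    '--onefile',
    '--hidden-import=pandasql',
])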
