How to improve write to synapse in Spark? - apache-spark

I have a script that writes a dataframe to Synapse. The flow is: read from Synapse > transform on Databricks (GROUP BY CUBE) > write back to Synapse. The fact table I read from Synapse has 150 million rows, and I apply a GROUP BY CUBE to transform it. The problem is that the write step is very slow, and I have not been able to figure out why.
I have tried several solutions:
- Auto optimize
%sql set spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite = true;
set spark.databricks.delta.properties.defaults.autoOptimize.autoCompact = true;
- Repartition
df.repartition(10000)
- Adaptive Query Execution (AQE)
spark.conf.set("spark.sql.adaptive.optimizeSkewedJoin.enabled", "true")
spark.conf.set("spark.sql.adaptive.enabled", "true")
- Write Semantics
spark.conf.set("spark.databricks.sqldw.writeSemantics", "copy")
None of these solved the problem; the write is still slow.
This is my processing time (the screenshot is not included here). As you can see, some of the jobs are skipped; why does that happen? (In the Spark UI, a skipped stage means its shuffle output was already computed earlier and is being reused, so skipped stages are not themselves a problem.)
This is my cluster specification (screenshot also not included).
This is my read-from-Synapse script:
df = (spark.read
.format("com.databricks.spark.sqldw")
.option("url", url)
.option("tempDir", tempDir)
.option("forwardSparkAzureStorageCredentials", "true")
.option("query", sql)
.load()
)
This is my write-to-Synapse script:
df.write
.format("com.databricks.spark.sqldw")
.option("tableOptions", "CLUSTERED COLUMNSTORE INDEX, DISTRIBUTION = ROUND_ROBIN") # Added at 20200121
.option("url", url)
.option("dbtable", table)
.option("forward_spark_azure_storage_credentials","True")
.option("tempdir", tempDir)
.mode(write_mode)
.save()
This is my transform script:
cube_department_read = cube_department_df.cube(cube_department_df["YEAR"], cube_department_df["WeekOfYear"], cube_department_df["Month"],
cube_department_df["department_groups"], cube_department_df["category_name"],
cube_department_df["subcategory_name"], cube_department_df["section_name"]) \
.agg(F.max('last_date_of_week').alias('last_date_of_week'),
F.countDistinct('internal_tranx_key').alias('sales_basket'),
F.sum('SalesAmt').alias('sales_amt'),
F.sum('SalesQty').alias('sales_qty'),
F.sum('SalesQtyPro').alias('SalesQtyPro'),
F.sum('SalesAmtPro').alias('SalesAmtPro'),
F.countDistinct('membership_id').alias('member_count'),
F.sum(F.when(cube_department_df["membership_id"].isNotNull(),
cube_department_df["SalesQty"]).otherwise(0)).alias("SalesQty_Member"),
F.sum(F.when(cube_department_df["membership_id"].isNotNull(),
cube_department_df["SalesAmt"]).otherwise(0)).alias("SalesAmt_Member"),
F.sum(F.when(cube_department_df["membership_id"].isNotNull(),
1).otherwise(0)).alias("Basket_Count_Member"),
F.sum(F.when(cube_department_df["membership_id"].isNotNull(),
0).otherwise(cube_department_df["SalesQty"])).alias("SalesQty_NonMember"),
F.sum(F.when(cube_department_df["membership_id"].isNotNull(),
0).otherwise(cube_department_df["SalesAmt"])).alias("SalesAmt_NonMember"),
F.sum(F.when(cube_department_df["membership_id"].isNotNull(),
0).otherwise(1)).alias("Basket_Count_NonMember"),
F.sum(F.when(cube_department_df["promotion_flag"] == 'Y',
cube_department_df["SalesAmt"]).otherwise(0)).alias("SalesAmt_MMDS_Promotion"),
F.sum(F.when(cube_department_df["promotion_flag"] == 'Y',
cube_department_df["SalesQty"]).otherwise(0)).alias("SalesQty_MMDS_Promotion"),
F.sum(F.when(cube_department_df["promotion_flag"] == 'Y',
1).otherwise(0)).alias("Basket_Count_MMDS_Promotion"),
F.sum(F.when(cube_department_df["promotion_flag"] == 'Y',
0).otherwise(cube_department_df["SalesAmt"])).alias("SalesAmt_Non_MMDS_Promotion"),
F.sum(F.when(cube_department_df["promotion_flag"] == 'Y',
0).otherwise(cube_department_df["SalesQty"])).alias("SalesQty_Non_MMDS_Promotion"),
F.sum(F.when(cube_department_df["promotion_flag"] == 'Y',
0).otherwise(1)).alias("Basket_Count_Non_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'Y') & (cube_department_df["membership_id"].isNotNull()),
cube_department_df["SalesAmt"]).otherwise(0)).alias("SalesAmt_Member_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'Y') & (cube_department_df["membership_id"].isNotNull()),
cube_department_df["SalesQty"]).otherwise(0)).alias("SalesQty_Member_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'Y') & (cube_department_df["membership_id"].isNotNull()),
1).otherwise(0)).alias("Basket_Count_Member_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'Y') & (cube_department_df["membership_id"].isNull()),
cube_department_df["SalesAmt"]).otherwise(0)).alias("SalesAmt_Non_Member_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'Y') & (cube_department_df["membership_id"].isNull()),
cube_department_df["SalesQty"]).otherwise(0)).alias("SalesQty_Non_Member_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'Y') & (cube_department_df["membership_id"].isNull()),
1).otherwise(0)).alias("Basket_Count_Non_Member_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'N') & (cube_department_df["membership_id"].isNotNull()),
cube_department_df["SalesAmt"]).otherwise(0)).alias("SalesAmt_Member_Non_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'N') & (cube_department_df["membership_id"].isNotNull()),
cube_department_df["SalesQty"]).otherwise(0)).alias("SalesQty_Member_Non_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'N') & (cube_department_df["membership_id"].isNotNull()),
1).otherwise(0)).alias("Basket_Count_Member_Non_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'N') & (cube_department_df["membership_id"].isNull()),
cube_department_df["SalesAmt"]).otherwise(0)).alias("SalesAmt_Non_Member_Non_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'N') & (cube_department_df["membership_id"].isNull()),
cube_department_df["SalesQty"]).otherwise(0)).alias("SalesQty_Non_Member_Non_MMDS_Promotion"),
F.sum(F.when((cube_department_df["promotion_flag"] == 'N') & (cube_department_df["membership_id"].isNull()),
1).otherwise(0)).alias("Basket_Count_Non_Member_Non_MMDS_Promotion"),
F.when((F.sum(cube_department_df["SalesQty"]) < 0) & (F.sum(cube_department_df["SalesAmt"]) < 0),
(F.sum(cube_department_df["SalesAmt"]) / F.sum(cube_department_df["SalesQty"])) * -1) \
.when((F.sum(cube_department_df["SalesQty"]) == 0) | (F.sum(cube_department_df["SalesAmt"]) == 0),
0).otherwise(F.sum(cube_department_df["SalesAmt"]) / F.sum(cube_department_df["SalesQty"])).alias("sales_per_unit"),
F.when((F.sum(cube_department_df["SalesQty"]) < 0) & (F.sum(cube_department_df["SalesAmt"]) < 0),
(F.sum(cube_department_df["SalesAmt"]) / F.countDistinct(cube_department_df["internal_tranx_key"])) * -1) \
.when((F.sum(cube_department_df["SalesQty"]) == 0) | (F.sum(cube_department_df["SalesAmt"]) == 0),
0).otherwise(F.sum(cube_department_df["SalesAmt"]) / F.countDistinct(cube_department_df["internal_tranx_key"])).alias("sales_per_basket"),
F.when((F.sum(cube_department_df["SalesQty"]) < 0) & (F.sum(cube_department_df["SalesAmt"]) < 0),
(F.sum(cube_department_df["SalesQty"]) / F.countDistinct(cube_department_df["internal_tranx_key"])) * -1) \
.when((F.sum(cube_department_df["SalesQty"]) == 0) | (F.sum(cube_department_df["SalesAmt"]) == 0),
0).otherwise(F.sum(cube_department_df["SalesQty"]) / F.countDistinct(cube_department_df["internal_tranx_key"])).alias("unit_per_basket"),
F.when((F.countDistinct(cube_department_df["membership_id"]) < 0) & (F.sum(cube_department_df["SalesAmt"]) < 0),
(F.sum(cube_department_df["SalesAmt"]) / F.countDistinct(cube_department_df["membership_id"])) * -1) \
.when((F.countDistinct(cube_department_df["membership_id"]) == 0) | (F.sum(cube_department_df["SalesAmt"]) == 0),
0).otherwise(F.sum(cube_department_df["SalesAmt"]) / F.countDistinct(cube_department_df["membership_id"])).alias("spend_per_customer")) \
.select("YEAR","WeekOfYear","Month","department_groups","category_name","subcategory_name","section_name",
"last_date_of_week","sales_basket","sales_amt","sales_qty","SalesQtyPro","SalesAmtPro",
"member_count","SalesQty_Member","SalesAmt_Member", "Basket_Count_Member",
"SalesQty_NonMember","SalesAmt_NonMember", "Basket_Count_NonMember",
"SalesAmt_MMDS_Promotion", "SalesQty_MMDS_Promotion", "Basket_Count_MMDS_Promotion",
"SalesAmt_Non_MMDS_Promotion","SalesQty_Non_MMDS_Promotion", "Basket_Count_Non_MMDS_Promotion",
"SalesAmt_Member_MMDS_Promotion","SalesQty_Member_MMDS_Promotion","Basket_Count_Member_MMDS_Promotion",
"SalesAmt_Non_Member_MMDS_Promotion","SalesQty_Non_Member_MMDS_Promotion","Basket_Count_Non_Member_MMDS_Promotion",
"SalesAmt_Member_Non_MMDS_Promotion","SalesQty_Member_Non_MMDS_Promotion","Basket_Count_Member_Non_MMDS_Promotion",
"SalesAmt_Non_Member_Non_MMDS_Promotion","SalesQty_Non_Member_Non_MMDS_Promotion","Basket_Count_Non_Member_Non_MMDS_Promotion",
"sales_per_unit","sales_per_basket","unit_per_basket", "spend_per_customer") \
.orderBy(F.col("YEAR").asc(),
F.col("WeekOfYear").asc(),
F.col("Month").asc(),
F.col("department_groups").asc(),
F.col("category_name").asc(),
F.col("subcategory_name").asc(),
F.col("section_name").asc())

Related

Loop over a dictionary of list and update the corresponding columns - pandas

I have a df and a dictionary of lists, as shown below.
Date Tea_Good Tea_bad coffee_good coffee_bad
2020-02-01 3 1 10 7
2020-02-02 3 1 10 7
2020-02-03 3 1 10 7
2020-02-04 3 1 10 7
2020-02-05 6 1 10 7
2020-02-06 6 2 10 11
2020-02-07 6 2 5 11
2020-02-08 6 2 5 11
2020-02-09 9 2 5 11
2020-02-10 9 2 4 11
2020-02-11 9 2 4 11
2020-02-12 9 2 4 11
2020-02-13 9 2 4 11
2020-02-14 9 2 4 11
The dict is
rf = {
"tea":
[
{
"type": "linear",
"from": "2020-02-01T20:00:00.000Z",
"to": "2020-02-03T20:00:00.000Z",
"days":3,
"coef":[0.1,0.1,0.1,0.1,0.1,0.1],
"case":"bad"
},
{
"type": "polynomial",
"from": "2020-02-08T20:00:00.000Z",
"to": "2020-02-10T20:00:00.000Z",
"days":3,
"coef":[0.1,0.1,0.1,0.1,0.1,0.1],
"case":"good"
}],
"coffee": [
{
"type": "quadratic",
"from": "2020-02-01T20:00:00.000Z",
"to": "2020-02-10T20:00:00.000Z",
"days": 10,
"coef": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
"case":"good"
},
{
"type": "constant",
"from": "2020-02-11T20:00:00.000Z",
"to": "2020-02-13T20:00:00.000Z",
"days": 5,
"coef": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
"case":"bad"
}]}
Explanation:
The dictionary contains two keys:
1. "tea"
2. "coffee"
Based on the key, I want to update the corresponding column of df.
1. Which column?
If key == "tea" and "case" == "bad", update the Tea_bad column.
2. When?
"from": "2020-02-01T20:00:00.000Z",
"to": "2020-02-03T20:00:00.000Z"
3. How?
if "type": "linear",
when "from": "2020-02-01T20:00:00.000Z"
t = 0,
a0 = coef[0]
a1 = coef[1]
a2 = coef[2]
a3 = coef[3]
a4 = coef[4]
a5 = coef[5]
df.loc[(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = a0 + a1 * t
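To make the intent concrete, here is a minimal sketch of applying just the first "tea" entry ("linear", case "bad") in plain pandas. It assumes df['Date'] is already datetime; the timestamps in the dict carry a UTC timezone and a time of day, so they are normalized to bare dates first:
import pandas as pd

entry = rf["tea"][0]  # the "linear" / "bad" entry
start = pd.to_datetime(entry["from"]).tz_localize(None).normalize()
end = pd.to_datetime(entry["to"]).tz_localize(None).normalize()
a0, a1 = entry["coef"][0], entry["coef"][1]
mask = df["Date"].between(start, end)
t = (df.loc[mask, "Date"] - start).dt.days  # t = 0 on the "from" date
df.loc[mask, "Tea_bad"] = a0 + a1 * t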
I tried the code below, but it is not working. Please don't look too closely at the code itself; if you can, implement it in your own way and help me.
def rf_user_input(df, REQUEST_OBJ):
'''
This functions returns the tea_coffee dataframe with the user input functions for tea, coffee
params: data : tea_coffee dataframe uploaded from user
request_object_api: The api should contain the below params
start_date: start date of the user function for rf
end_date : end date of the user function for the rf
label : {'constant', 'linear', 'quadratic', 'polynomial', 'exponential', 'df'}
coef : list with 6 indexes [a0,a1,a2,a3,a4,a5]
return: rf computed with user inputs
'''
# df.days.iloc[(df[df.Date==start_date].index[0])]
df = df.sort_values(by='Date')
df['days'] = (df['Date'] - df.at[0, 'Date']).dt.days + 1
REQUIRED_KEYS = ["tea", "coffee"]
for teacoffee_category in REQUIRED_KEYS:
print(f" teacoffee_category - {teacoffee_category}")
if teacoffee_category in REQUEST_OBJ.keys():
param_obj_list = REQUEST_OBJ[teacoffee_category]
for params_obj in param_obj_list:
# Do the data processing
goodbad_catgeory = params_obj['case']
kind = teacoffee_category + '_' + goodbad_catgeory
start_date, end_date, label, coef, n_days = params_obj['from'], params_obj['to'], params_obj['type'], \
params_obj['coef'], params_obj['days']
start_date = DT.datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%S.%fZ")
end_date = DT.datetime.strptime(end_date, "%Y-%m-%dT%H:%M:%S.%fZ")
print(f" start date - {start_date}")
print(f" end date - {end_date}")
# Additional n_days code - Start
first_date = df['Date'].min()
period_days = (start_date - first_date)
print(f" period day - {period_days}")
# Additional n_days code - End
# Checking 'start_date' , 'end_date' and 'n_days' conditions
# If the start_date and end_date is null return the calibration df as it is
if (start_date == 0) & (end_date == 0):
return df
if (start_date == 0) & (end_date != 0) & (n_days == 0):
return df
if (start_date != 0) & (end_date == 0) & (n_days == 0):
return df
# if start date, end date and n_days are non zero then consider start date and n_days
if (start_date != 0) & (end_date != 0) & (n_days != 0):
#n_days = (end_date - start_date).days
#n_days = (end_date - start_date).days
end_date = start_date + DT.timedelta(days=n_days)
if (start_date != 0) & (end_date != 0) & (n_days == 0) :
n_days = (end_date - start_date)
print(f" n day = {n_days}")
end_date = end_date
if (start_date != 0) & (end_date == 0) & (n_days != 0) :
#n_days = (end_date - start_date)
#print(f" n day = {n_days}")
end_date = start_date + DT.timedelta(days=n_days)
if (start_date == 0) & (end_date != 0) & (n_days != 0) :
start_date = end_date - DT.timedelta(days=n_days)
if (n_days != 0) & (start_date != 0):
end_date = start_date + DT.timedelta(days=n_days)
# If the start_date and end_date is null return the calibration df as it is
if len(coef) == 6:
# Coefficients Index Initializations
a0 = coef[0]
a1 = coef[1]
a2 = coef[2]
a3 = coef[3]
a4 = coef[4]
a5 = coef[5]
# Constant
if label == 'constant':
if kind == 'tea_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = a0 + (df['days']) - period_days
elif kind == 'tea_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = a0 + df['days'] - period_days
elif kind == 'coffee_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_good'] = a0 + df['days'] - period_days
elif kind == 'coffee_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = a0 + df['days'] - period_days
# Linear
if label == 'linear':
if kind == 'tea_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = a0 + (
a1 * ((df['days']) - period_days))
elif kind == 'tea_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = a0 + (
a1 * ((df['days']) - period_days))
elif kind == 'coffee_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_good'] = a0 + (
a1 * ((df['days']) - period_days))
elif kind == 'coffee_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = a0 + (
a1 * ((df['days']) - period_days))
# Quadratic
if label == 'quadratic':
if kind == 'tea_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * ((df['days']) - period_days) ** 2)
elif kind == 'tea_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * ((df['days']) - period_days) ** 2)
elif kind == 'coffee_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_good'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * ((df['days']) - period_days) ** 2)
elif kind == 'coffee_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * ((df['days']) - period_days) ** 2)
# Polynomial
if label == 'polynomial':
if kind == 'tea_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * (
(df['days']) - period_days) ** 2) + (a3 * (
(df['days']) - period_days) ** 3) + (a4 * (
(df['days']) - period_days) ** 4) + (a5 * ((df['days']) - period_days) ** 5)
elif kind == 'tea_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * (
(df['days']) - period_days) ** 2) + (a3 * (
(df['days']) - period_days) ** 3) + (a4 * (
(df['days']) - period_days) ** 4) + (a5 * ((df['days']) - period_days) ** 5)
elif kind == 'coffee_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_good'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * (
(df['days']) - period_days) ** 2) + (a3 * (
(df['days']) - period_days) ** 3) + (a4 * (
(df['days']) - period_days) ** 4) + (a5 * ((df['days']) - period_days) ** 5)
elif kind == 'coffee_bad':
df.loc[(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * (
(df['days']) - period_days) ** 2) + (a3 * (
(df['days']) - period_days) ** 3) + (a4 * (
(df['days']) - period_days) ** 4) + (a5 * ((df['days']) - period_days) ** 5)
# Exponential
if label == 'exponential':
if kind == 'tea_good':
df.loc[(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = np.exp(a0)
elif kind == 'tea_bad':
df.loc[(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = np.exp(a0)
elif kind == 'coffee_good':
df.loc[(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_good'] = np.exp(a0)
elif kind == 'coffee_bad':
df.loc[(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = np.exp(a0)
# Calibration File
if label == 'calibration_file':
pass
# return df
else:
raise Exception('Coefficients index do not match. All values of coefficients should be passed')
else:
return df
return df
I have also asked the same question in a different way, since I thought I had not explained it well. The link to that question is given below:
Replace the column values based on the list of dictionary and specific date condition - use if and for loop - Pandas
Use:
def rf_user_input(df, req_obj):
df = df.sort_values('Date')
df['days'] = (df['Date'] - df.at[0, 'Date']).dt.days + 1
cols, df.columns = df.columns, df.columns.str.lower()
for category in ("tea", "coffee"):
if category not in req_obj.keys():
continue
for params_obj in req_obj[category]:
case = params_obj['case']
kind = '{}_{}'.format(category, case)
start_date = pd.to_datetime(params_obj['from'], format='%Y-%m-%dT%H:%M:%S.%fZ')
end_date = pd.to_datetime(params_obj['to'], format='%Y-%m-%dT%H:%M:%S.%fZ')
label, coef, n_days = params_obj['type'], params_obj['coef'], params_obj['days']
# Additional n_days code - Start
first_date = df['date'].min()
period_days = (start_date - first_date).days
# Additional n_days code - End
# Checking 'start_date' , 'end_date' and 'n_days' conditions
# If the start_date and end_date is null return the calibration df as it is
if (start_date == 0) and (end_date == 0):
return df.set_axis(cols, axis=1)
if (start_date == 0) and (end_date != 0) and (n_days == 0):
return df.set_axis(cols, axis=1)
if (start_date != 0) and (end_date == 0) and (n_days == 0):
return df.set_axis(cols, axis=1)
# if start date, end date and n_days are non zero then consider start date and n_days
if (start_date != 0) and (end_date != 0) and (n_days != 0):
end_date = start_date + pd.Timedelta(days=n_days)
if (start_date != 0) and (end_date != 0) and (n_days == 0):
n_days = (end_date - start_date)
if (start_date != 0) and (end_date == 0) and (n_days != 0):
end_date = start_date + pd.Timedelta(days=n_days)
if (start_date == 0) and (end_date != 0) and (n_days != 0):
start_date = end_date - pd.Timedelta(days=n_days)
if (n_days != 0) and (start_date != 0):
end_date = start_date + pd.Timedelta(days=n_days)
# If the start_date and end_date is null return the calibration df as it is
if len(coef) == 6:
a0, a1, a2, a3, a4, a5 = coef
mask = df['date'].between(start_date, end_date)
if label == 'constant':
if kind in ('tea_good', 'tea_bad', 'coffee_good', 'coffee_bad'):
df.loc[mask, kind] = a0 + df['days'] - period_days
elif label == 'linear':
if kind in ('tea_good', 'tea_bad', 'coffee_good', 'coffee_bad'):
df.loc[mask, kind] = a0 + \
(a1 * ((df['days']) - period_days))
# Quadratic
elif label == 'quadratic':
if kind in ('tea_good', 'tea_bad', 'coffee_good', 'coffee_bad'):
df.loc[mask, kind] = a0 + (a1 * ((df['days']) - period_days)) + (
a2 * ((df['days']) - period_days) ** 2)
# Polynomial
elif label == 'polynomial':
if kind in ('tea_good', 'tea_bad', 'coffee_good', 'coffee_bad'):
df.loc[mask, kind] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * (
(df['days']) - period_days) ** 2) + (a3 * (
(df['days']) - period_days) ** 3) + (a4 * (
(df['days']) - period_days) ** 4) + (a5 * ((df['days']) - period_days) ** 5)
# Exponential
elif label == 'exponential':
if kind in ('tea_good', 'tea_bad', 'coffee_good', 'coffee_bad'):
df.loc[mask, kind] = np.exp(a0)
# Calibration File
elif label == 'calibration_file':
pass
else:
raise Exception(
'Coefficients index do not match. All values of coefficients should be passed')
return df.set_axis(cols, axis=1)
Result:
# rf_user_input(df, rf)
Date Tea_Good Tea_bad coffee_good coffee_bad days
0 2020-02-01 3.0 1.0 10.0 7.0 1
1 2020-02-02 3.0 0.3 0.3 7.0 2
2 2020-02-03 3.0 0.4 0.4 7.0 3
3 2020-02-04 3.0 0.5 0.5 0.3 4
4 2020-02-05 12.0 1.0 3.1 0.4 5
5 2020-02-06 13.0 2.0 4.3 0.5 6
6 2020-02-07 6.0 2.0 5.7 0.6 7
7 2020-02-08 6.0 2.0 7.3 11.0 8
8 2020-02-09 6.3 2.0 9.1 11.0 9
9 2020-02-10 36.4 2.0 11.1 11.0 10
10 2020-02-11 136.5 2.0 13.3 11.0 11
11 2020-02-12 9.0 2.0 4.0 11.0 12
12 2020-02-13 9.0 2.0 4.0 11.0 13
13 2020-02-14 9.0 2.0 4.0 11.0 14
One solution is to loop over the dictionary and use apply:
df.Date = pd.to_datetime(df.Date)
df = df.set_index('Date', drop=True)
df['Period'] = [(date - df.index[0]).days for date in df.index]
for key, val in rf.items():
for elem in val:
type_method = elem.get('type')
col_name = f'{key.capitalize()}_{elem.get("case")}'
date_from = pd.to_datetime(elem.get('from'))
date_to = pd.to_datetime(elem.get('to'))
a0, a1, a2, a3, a4, a5 = elem.get('coef')
mask_dates = (df.index >= date_from) & (df.index <= date_to)
func_dict = {
'linear': lambda x: a0 + a1 * x['Period'],
'constant': lambda x: a0 + x['Period'],
'quadratic': lambda x: a0 + a1 * (x['Period']) + a2 * (x['Period'] ** 2),
'exponential': lambda x: np.exp(a0),
'polynomial': lambda x: a0 +
a1 * (x['Period']) +
a2 * (x['Period'] ** 2) +
a3 * (x['Period'] ** 3) +
a4 * (x['Period'] ** 4) +
a5 * (x['Period'] ** 5),
}
df.loc[mask_dates, col_name] = df[mask_dates].apply(func_dict[type_method], axis=1)
Output:
Tea_good Tea_bad Coffee_good Coffee_bad Period
Date
2020-02-01 3.0 1.0 10.0 7.0 0
2020-02-02 3.0 0.2 0.3 7.0 1
2020-02-03 3.0 0.3 0.7 7.0 2
2020-02-04 3.0 1.0 1.3 7.0 3
2020-02-05 6.0 1.0 2.1 7.0 4
2020-02-06 6.0 2.0 3.1 11.0 5
2020-02-07 6.0 2.0 4.3 11.0 6
2020-02-08 6.0 2.0 5.7 11.0 7
2020-02-09 3744.9 2.0 7.3 11.0 8
2020-02-10 6643.0 2.0 9.1 11.0 9
2020-02-11 9.0 2.0 4.0 11.0 10
2020-02-12 9.0 2.0 4.0 11.1 11
2020-02-13 9.0 2.0 4.0 12.1 12
2020-02-14 9.0 2.0 4.0 11.0 13
Please note that I had to change the column names so that tea/coffee are capitalized. Also, the use of lambda functions like this is lazy and should be refactored into normal functions.
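For reference, a sketch of that refactor: one plain function, parameterized by the coefficients, replaces the whole lambda dict and can be assigned vectorized, without apply. Names follow the snippet above:
import numpy as np

def evaluate(type_method, period, coef):
    # Vectorized equivalents of the lambdas above; `period` is the Period
    # column (a pandas Series) restricted to the masked rows.
    a0 = coef[0]
    if type_method == 'constant':
        return a0 + period
    if type_method == 'exponential':
        return np.exp(a0)
    degree = {'linear': 1, 'quadratic': 2, 'polynomial': 5}[type_method]
    return sum(c * period ** p for p, c in enumerate(coef[:degree + 1]))

# inside the loop, replacing the apply(...) call:
# df.loc[mask_dates, col_name] = evaluate(type_method, df.loc[mask_dates, 'Period'], elem['coef'])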

Replace the column values based on the list of dictionary and specific date condition - Pandas

I have two dfs and a dictionary of lists, as shown below.
df:
Date Tea_Good Tea_bad coffee_good coffee_bad
2020-02-01 3 1 10 7
2020-02-02 3 1 10 7
2020-02-03 3 1 10 7
2020-02-04 3 1 10 7
2020-02-05 6 1 10 7
2020-02-06 6 2 10 11
2020-02-07 6 2 5 11
2020-02-08 6 2 5 11
2020-02-09 9 2 5 11
2020-02-10 9 2 4 11
2020-02-11 9 2 4 11
2020-02-12 9 2 4 11
2020-02-13 9 2 4 11
2020-02-14 9 2 4 11
df2
ObservationDate beta
2020-02-01 100
2020-02-02 230
2020-02-03 150
2020-02-04 100
2020-02-05 200
2020-02-06 180
2020-02-07 190
2020-02-08 290
rf = {
"tea":
[
{
"type": "replace_beta",
"from": "2020-02-01T20:00:00.000Z",
"to": "2020-02-03T20:00:00.000Z",
"days":3,
"coef":[0.1,0.1,0.1,0.1,0.1,0.1],
"case":"bad"
},
{
"type": "polynomial",
"from": "2020-02-08T20:00:00.000Z",
"to": "2020-02-10T20:00:00.000Z",
"days":3,
"coef":[0.1,0.1,0.1,0.1,0.1,0.1],
"case":"good"
},
{
"type": "linear",
"from": "2020-02-01T20:00:00.000Z",
"to": "2020-02-03T20:00:00.000Z",
"days":3,
"coef":[0.1,0.1,0.1,0.1,0.1,0.1],
"case":"bad"
},
{
"type": "replace_beta",
"from": "2020-02-04T20:00:00.000Z",
"to": "2020-02-05T20:00:00.000Z",
"days":2,
"coef":[10,10,10,10,10,10],
"case":"good"
}],
"coffee": [
{
"type": "replace_beta",
"from": "2020-02-01T20:00:00.000Z",
"to": "2020-02-10T20:00:00.000Z",
"days": 10,
"coef": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
"case":"good"
},
{
"type": "replace_beta",
"from": "2020-02-11T20:00:00.000Z",
"to": "2020-02-15T20:00:00.000Z",
"days": 5,
"coef": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
"case":"bad"
},
{
"type": "replace_beta",
"from": "2020-02-01T20:00:00.000Z",
"to": "2020-02-03T20:00:00.000Z",
"days": 3,
"coef": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
"case":"good"
},
{
"type": "replace_beta",
"from": "2020-02-03T20:00:00.000Z",
"to": "2020-02-06T20:00:00.000Z",
"days": 4,
"coef": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
"case":"bad"
}
]
}
I tried the code below:
def sc_user_input(df, df2, sc):
REQUIRED_KEYS = ["tea", "coffee"]
for teacoffee_category in REQUIRED_KEYS:
print(teacoffee_category)
if teacoffee_category in sc.keys():
param_obj_list = sc[teacoffee_category]
for params_obj in param_obj_list:
goodbad_catgeory = params_obj['case']
kind = teacoffee_category + '_' + goodbad_catgeory
# Do the data processing
start_date, end_date, label, coef, n_days = params_obj['from'], params_obj['to'], params_obj['type'], \
params_obj['coef'], params_obj['days']
#start_date = DT.datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%S.%fZ")
#print(start_date)
#start = pd.Timestamp(d['from']).strftime('%Y-%m-%d')
start_date = DT.datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%S.%fZ")
print(start_date)
#end_date = pd.Timestamp(params_obj['to']).strftime('%Y-%m-%d')
# Additional n_days code - Start
df['Date'] = pd.to_datetime(df['Date'])
df['days'] = (df['Date'] - df.at[0, 'Date']).dt.days + 1
first_date = df['Date'].min()
period_days = (start_date - first_date).days
print(period_days)
# Additional n_days code - End
if (n_days != 0) & (start_date != 0):
end_date = start_date + DT.timedelta(days=n_days)
print(end_date)
start_date = pd.Timestamp(start_date).strftime('%Y-%m-%d')
print(start_date)
end_date = pd.Timestamp(end_date).strftime('%Y-%m-%d')
print(end_date)
if (start_date == 0) | (end_date == 0):
return df
if len(coef) == 6:
# Coefficients Index Initializations
a0 = coef[0]
a1 = coef[1]
a2 = coef[2]
a3 = coef[3]
a4 = coef[4]
a5 = coef[5]
# Constant
if label == 'constant':
if kind == 'tea_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = a0 * (
(df['days']) - period_days)
elif kind == 'tea_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = a0 * (
(df['days']) - period_days)
elif kind == 'coffee_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_good'] = a0 * (
(df['days']) - period_days)
elif kind == 'coffee_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = a0 * (
(df['days']) - period_days)
# Linear
if label == 'linear':
if kind == 'tea_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = a0 + (
a1 * ((df['days']) - period_days))
elif kind == 'tea_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = a0 + (
a1 * ((df['days']) - period_days))
elif kind == 'coffee_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_good'] = a0 + (
a1 * ((df['days']) - period_days))
elif kind == 'coffee_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = a0 + (
a1 * ((df['days']) - period_days))
# Quadratic
if label == 'quadratic':
if kind == 'tea_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * ((df['days']) - period_days) ** 2)
elif kind == 'tea_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * ((df['days']) - period_days) ** 2)
elif kind == 'coffee_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_good'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * ((df['days']) - period_days) ** 2)
elif kind == 'coffee_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * ((df['days']) - period_days) ** 2)
# Polynomial
if label == 'polynomial':
if kind == 'tea_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * (
(df['days']) - period_days) ** 2) + (a3 * (
(df['days']) - period_days) ** 3) + (a4 * (
(df['days']) - period_days) ** 4) + (a5 * ((df['days']) - period_days) ** 5)
elif kind == 'tea_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * (
(df['days']) - period_days) ** 2) + (a3 * (
(df['days']) - period_days) ** 3) + (a4 * (
(df['days']) - period_days) ** 4) + (a5 * ((df['days']) - period_days) ** 5)
elif kind == 'coffee_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_good'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * (
(df['days']) - period_days) ** 2) + (a3 * (
(df['days']) - period_days) ** 3) + (a4 * (
(df['days']) - period_days) ** 4) + (a5 * ((df['days']) - period_days) ** 5)
elif kind == 'coffee_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = a0 + (
a1 * ((df['days']) - period_days)) + (a2 * (
(df['days']) - period_days) ** 2) + (a3 * (
(df['days']) - period_days) ** 3) + (a4 * (
(df['days']) - period_days) ** 4) + (a5 * ((df['days']) - period_days) ** 5)
# Exponential
if label == 'exponential':
if kind == 'tea_good':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_Good'] = np.exp(
a0)
elif kind == 'tea_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'Tea_bad'] = np.exp(
a0)
elif kind == 'coffee_good':
df.loc[(df['Date'] >= start_date) & (
df['Date'] <= end_date), 'coffee_good'] = np.exp(a0)
elif kind == 'coffee_bad':
df.loc[
(df['Date'] >= start_date) & (df['Date'] <= end_date), 'coffee_bad'] = np.exp(
a0)
#google mobility
if label == 'google_mobility':
print(label)
if kind == 'tea_good':
#df1 = df1[['Date', 'Blue-collar best']].copy()
df2 = df2[['ObservationDate', 'beta']]
df2.rename({'ObservationDate':'Date'}, axis=1, inplace=True)
df['Date'] = pd.to_datetime(df['Date'])
df2['Date'] = pd.to_datetime(df2['Date'])
if df2['Date'].max() < pd.Timestamp(end_date):
print(f"we dont have data beyond {df2['Date'].max()}")
df = df.merge(df2, on='Date', how='left')
m = df['Date'].between(start_date, end_date, inclusive=True)
df.loc[m, 'Tea_Good'] = df['Date'].map(df2.set_index('Date')['beta']).fillna(df['Tea_Good'])
#df.loc[m, 'Blue-collar best'] = df.pop('beta').fillna(df['Blue-collar best'])
#df = df1
elif kind == 'tea_bad':
#df1 = df1[['Date', 'Int blue-collar']].copy()
df2 = df2[['ObservationDate', 'beta']]
df2.rename({'ObservationDate':'Date'}, axis=1, inplace=True)
df['Date'] = pd.to_datetime(df['Date'])
df2['Date'] = pd.to_datetime(df2['Date'])
if df2['Date'].max() < pd.Timestamp(end_date):
print(f"we dont have data beyond {df2['Date'].max()}")
df = df.merge(df2, on='Date', how='left')
m = df['Date'].between(start_date, end_date, inclusive=True)
df.loc[m, 'Tea_bad'] = df['Date'].map(df2.set_index('Date')['beta']).fillna(df['Tea_bad'])
#df.loc[m, 'Int blue-collar'] = df.pop('beta').fillna(df['Int blue-collar'])
#df = df1
elif kind == 'coffee_good':
#df1 = df1[['Date', 'White-collar best']].copy()
df2 = df2[['ObservationDate', 'beta']]
df2.rename({'ObservationDate':'Date'}, axis=1, inplace=True)
df['Date'] = pd.to_datetime(df['Date'])
df2['Date'] = pd.to_datetime(df2['Date'])
if df2['Date'].max() < pd.Timestamp(end_date):
print(f"we dont have data beyond {df2['Date'].max()}")
df = df.merge(df2, on='Date', how='left')
m = df['Date'].between(start_date, end_date, inclusive=True)
#df1.loc[m, 't_factor'] = df1['Date'].map(df2.set_index('Date')['beta']).fillna(df1['t_factor'])
df.loc[m, 'coffee_good'] = df['Date'].map(df2.set_index('Date')['beta']).fillna(df['coffee_good'])
#df.loc[m, 'White-collar best'] = df.pop('beta').fillna(df['White-collar best'])
#df = df1
elif kind == 'coffee_bad':
#df1 = df1[['Date', 'Int white-collar']].copy()
df2 = df2[['ObservationDate', 'beta']]
df2.rename({'ObservationDate':'Date'}, axis=1, inplace=True)
df['Date'] = pd.to_datetime(df['Date'])
df2['Date'] = pd.to_datetime(df2['Date'])
if df2['Date'].max() < pd.Timestamp(end_date):
print(f"we dont have data beyond {df2['Date'].max()}")
df = df.merge(df2, on='Date', how='left')
m = df['Date'].between(start_date, end_date, inclusive=True)
df.loc[m, 'coffee_bad'] = df['Date'].map(df2.set_index('Date')['beta']).fillna(df['coffee_bad'])
#df.loc[m, 'Int white-collar'] = df.pop('beta').fillna(df['Int white-collar'])
#df = df1
# return df
else:
raise Exception('Coefficients index do not match. All values of coefficients should be passed')
else:
raise Exception('Start date not defined')
else:
return df
return df
I am facing the error below and could not resolve it:
"['ObservationDate'] not in index"
Here is the full error:
KeyError Traceback (most recent call last)
<ipython-input-111-58a804a79c5c> in <module>
----> 1 sc_user_input(df, df2, scrf).head()
<ipython-input-110-66cf8cded7aa> in sc_user_input(df, df2, sc)
165 print(label)
166 if kind == 'tea_good':
--> 167 df2 = df2[['ObservationDate', 'beta']]
168 df2.rename({'ObservationDate':'Date'}, axis=1, inplace=True)
169 df['Date'] = pd.to_datetime(df['Date'])
~/admvenv/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
~/admvenv/lib/python3.7/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1551
1552 self._validate_read_indexer(
-> 1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
1555 return keyarr, indexer
~/admvenv/lib/python3.7/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1644 if not (self.name == "loc" and not raise_missing):
1645 not_found = list(set(key) - set(ax))
-> 1646 raise KeyError(f"{not_found} not in index")
1647
1648 # we skip the warning on Categorical/Interval
KeyError: "['ObservationDate'] not in index"
Explanation:
if "type" == replace_beta:
df1['col'] = df2['beta'] (duration only from the "from" and "to" date specified in that dictionary)
elif "type" == "quadratic":
df['col'] = a0 + a1*(T) + a2*(T)**2 + previous value of df['col']
where T = 1 one day after the "from" date of that dictionary, and T is counted in days based on the Date value
elif "type" == "linear":
df['col'] = a0 + a1*(T) + previous value of df['col']
where T = 1 for one day after the "from" date of that dictionary.
elif "type" == "polynomial":
df['col'] = a0 + a1*(T) + a2*(T)**2 + a3*(T)**3 + a4*(T)**4 + a5*(T)**5 + previous value of df['col']
where T = 1 for start_date of that dictionary.
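Two notes here, for reference. The KeyError almost certainly comes from df2 being modified in place on the first pass through that branch: it is overwritten by the two-column selection and ObservationDate is renamed to Date, so on the next matching entry df2[['ObservationDate', 'beta']] no longer finds the column. Selecting into a fresh local variable avoids that. A minimal sketch of the replace_beta case under that change, assuming df['Date'] is datetime and using the hypothetical names start_date, end_date (Timestamps for the current entry) and target_col (the column to overwrite, e.g. 'Tea_bad'):
import pandas as pd

# Build a Date -> beta lookup without touching df2 itself.
beta = (df2.assign(Date=pd.to_datetime(df2['ObservationDate']))
           .set_index('Date')['beta'])
mask = df['Date'].between(start_date, end_date)
# Overwrite only where a beta value exists; keep the old value elsewhere.
df.loc[mask, target_col] = (df.loc[mask, 'Date'].map(beta)
                            .fillna(df.loc[mask, target_col]))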

why is my ROC curve getting plotted in reverse

I have a csv file and tried to plot an ROC curve without using any predefined libraries for the curve plotting; I used only numpy and pandas. Can anyone tell me where I am going wrong? (The resulting ROC curve image is not included here.)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('5_b.csv')
#adding a new column to the data
df['Y_pred'] = np.where(df['proba']<=0.5, 0, 1)
#printing the top 5 values in data
#df.head(5)
#sorting dataframe
df=df.sort_values(by =['proba'])
df.head(5)
#confusion matrix
TP_Main = len(df[(df['y'] == 1) & (df['Y_pred'] == 1)])
FP_Main = len(df[(df['y'] == 0) & (df['Y_pred'] == 1)])
FN_Main = len(df[(df['y'] == 1) & (df['Y_pred'] == 0)])
TN_Main = len(df[(df['y'] == 0) & (df['Y_pred'] == 0)])
print("TN_Main : {0},FN_Main : {1}".format(TN_Main,FN_Main))
print("FP_Main : {0},TP_Main : {1}".format(FP_Main,TP_Main))
#F1score
precision = TP_Main/(TP_Main+FP_Main)
recall = TP_Main/(TP_Main+FN_Main)
F1score = ((precision*recall)/(precision+recall))*2
print("precision : {0},recall : {1}".format(precision,recall))
print("F1score : ",F1score)
#df.sort_values(by =['proba'], inplace = True, ascending = False)
tprList = []
fprList = []
for i in range(len(df)):
df['Y_pred'] =np.where(df['proba']<=df.iloc[i][1],0,1)
TP = len(df[(df['y'] == 1) & (df['Y_pred'] == 1)])
FP = len(df[(df['y'] == 0) & (df['Y_pred'] == 1)])
FN = len(df[(df['y'] == 1) & (df['Y_pred'] == 0)])
TN = len(df[(df['y'] == 0) & (df['Y_pred'] == 0)])
TPR = TP/(FN+TP)
FPR = TN/(FP+TN)
tprList.append(TPR)
fprList.append(FPR)
tpr_array = np.array(tprList)
fpr_array = np.array(fprList)
#Accuracy score
AccScore = (TN_Main+TP_Main)/len(df)
print("Accuracy Score =", AccScore)
AUCScore = np.trapz(tpr_array,fpr_array)
print("AUC Score :",AUCScore)
plt.plot(tpr_array,fpr_array)
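For reference, the curve comes out reversed for two reasons: inside the loop, FPR is computed as TN/(FP+TN), which is the true negative rate (1 - FPR), and the final plot call passes (tpr_array, fpr_array), putting TPR on the x-axis. A corrected sketch of the loop and plot, using the same dataframe and column names:
tprList, fprList = [], []
for i in range(len(df)):
    df['Y_pred'] = np.where(df['proba'] <= df.iloc[i][1], 0, 1)
    TP = len(df[(df['y'] == 1) & (df['Y_pred'] == 1)])
    FP = len(df[(df['y'] == 0) & (df['Y_pred'] == 1)])
    FN = len(df[(df['y'] == 1) & (df['Y_pred'] == 0)])
    TN = len(df[(df['y'] == 0) & (df['Y_pred'] == 0)])
    tprList.append(TP / (TP + FN))   # true positive rate
    fprList.append(FP / (FP + TN))   # false positive rate, not TN/(FP+TN)
tpr_array = np.array(tprList)
fpr_array = np.array(fprList)
AUCScore = abs(np.trapz(tpr_array, fpr_array))  # trapz is negative when x decreases
plt.plot(fpr_array, tpr_array)                  # x = FPR, y = TPR
plt.xlabel('FPR')
plt.ylabel('TPR')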

Use spark sql optimizer to optimize two ranges join

I am using the Spark SQL optimizer to optimize a join of two ranges; the rule rewrites the join into the intersection of the two ranges, avoiding the join entirely:
test("SparkTest") {
object RangeIntersectRule extends Rule[LogicalPlan] {
override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
case Join(Project(_, Range(start1, end1, _, _, _, _)), Project(_, Range(start2, end2, _, _, _, _)), _, _) => {
val start = start1 max start2
val end = end1 min end2
if (start1 > end2 || end1 < start2) Range(0, 0, 1, Some(1), false) else Range(start, end, 1, Some(1), false)
}
}
}
val spark = SparkSession.builder().master("local").appName("SparkTest").enableHiveSupport().getOrCreate()
spark.experimental.extraOptimizations = Seq(RangeIntersectRule)
spark.range(10, 40).toDF("x").createOrReplaceTempView("t1")
spark.range(20, 50).toDF("y").createOrReplaceTempView("t2")
val df = spark.sql("select t1.x from t1 join t2 on t1.x = t2.y")
df.explain(true)
df.show(truncate = false)
}
But when I run it, an exception is thrown. Could someone help me find where the problem is? Thanks.
The optimized logical plan and physical plan are:
== Optimized Logical Plan ==
Project [x#2L]
+- !Project [id#0L AS x#2L]
+- Range (20, 40, step=1, splits=Some(1))
== Physical Plan ==
Project [x#2L]
+- !Project [id#0L AS x#2L]
+- Range (20, 40, step=1, splits=1)
The exception is:
Caused by: java.lang.RuntimeException: Couldn't find id#0L in [id#14L]
at scala.sys.package$.error(package.scala:27)
at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:106)
at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:100)
at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:59)
... 47 more
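The root cause is visible in the exception: the Range produced by the rule allocates fresh output attributes (id#14L), while the surviving Project still references the old id#0L, so attribute binding fails at execution time. A version of the rule that matches the Range nodes directly and reuses the left range's output attributes fixes this: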
object RangeIntersectRule extends Rule[LogicalPlan] {
override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
case Join(Range(start1, end1, 1, Some(1), output1, false), Range(start2, end2, 1, Some(1), output2, false), Inner, _) => {
val start = start1 max start2
val end = end1 min end2
if (start1 > end2 || end1 < start2) Project(output1, Range(0, 0, 1, Some(1), output1, false))
else Project(output1, Range(start, end, 1, Some(1), output1, false))
}
}
}

OpenCV in Python

The code runs, but instead of showing my name while running detector.py, it says Unknown. Does anyone have a solution?
import cv2
import numpy as np
recognizer = cv2.face.LBPHFaceRecognizer_create()
recognizer.read('trainer/trainer.yml')
cascadePath = "haarcascade_frontalface_default.xml"
faceCascade = cv2.CascadeClassifier(cascadePath);
font = cv2.FONT_HERSHEY_SIMPLEX
cam = cv2.VideoCapture(0)
while True:
ret, im =cam.read()
gray = cv2.cvtColor(im,cv2.COLOR_BGR2GRAY)
faces = faceCascade.detectMultiScale(gray, 1.2,5)
for(x,y,w,h) in faces:
cv2.rectangle(im, (x-20,y-20), (x+w+20,y+h+20), (0,255,0), 4)
Id = recognizer.predict(gray[y:y+h,x:x+w])
if(Id == 1):
Id = "Alec"
elif(Id == 2):
Id = "Chase"
else:
Id = "Unknown"
cv2.rectangle(im, (x-22,y-90), (x+w+22, y-22), (0,255,0), -1)
cv2.putText(im, str(Id), (x,y-40), font, 2, (255,255,255), 3)
if cv2.waitKey(10) & 0xFF == ord('q'):
break
cam.release()
cv2.destroyAllWindows()
It is not showing any kind of error. I commented out the if (Id == X) code just to see what it would print on the screen: the program printed (1, 30-40), so I'm guessing the 1 is my ID. I have the dataset and trainer program if I need to provide them.
recognizer.predict returns both the Id and confidence score.
Id, conf = recognizer.predict(gray[y:y+h,x:x+w])
if(conf<50):
if(Id==1):
Id="asciime"
elif(Id==2):
Id="Something"
else:
Id="Unknown"
OpenCV's Python API documentation is very poor. I often use the C++ reference. In this case the predict method is
void cv::face::FaceRecognizer::predict(InputArray src, int& label, double& confidence) const
See https://docs.opencv.org/3.2.0/dd/d65/classcv_1_1face_1_1FaceRecognizer.html#ab0d593e53ebd9a0f350c989fcac7f251 .
The confidence threshold is commonly set to 50 (for LBPH, lower values mean a closer match).
However, Id holds two values: the int label and the double confidence.
https://docs.opencv.org/3.0.0/dd/d65/classcv_1_1face_1_1FaceRecognizer.html#aede3fa2ec7a4ee35e67bc998df23883b
Getting the label with Id[0] (the first value of the returned tuple) would work:
for(x,y,w,h) in faces:
cv2.rectangle(im, (x-20,y-20), (x+w+20,y+h+20), (0,255,0), 4)
Id = recognizer.predict(gray[y:y+h,x:x+w])
if(Id[0] == 1):
Id = "Alec"
elif(Id[0] == 2):
Id = "Chase"
else:
Id = "Unknown"
cv2.rectangle(im, (x-22,y-90), (x+w+22, y-22), (0,255,0), -1)
cv2.putText(im, str(Id), (x,y-40), font, 2, (255,255,255), 3)
if cv2.waitKey(10) & 0xFF == ord('q'):
break
or
for(x,y,w,h) in faces:
cv2.rectangle(im, (x-20,y-20), (x+w+20,y+h+20), (0,255,0), 4)
Id,conf = recognizer.predict(gray[y:y+h,x:x+w])
if(Id == 1):
Id = "Alec"
elif(Id == 2):
Id = "Chase"
else:
Id = "Unknown"
cv2.rectangle(im, (x-22,y-90), (x+w+22, y-22), (0,255,0), -1)
cv2.putText(im, str(Id), (x,y-40), font, 2, (255,255,255), 3)
if cv2.waitKey(10) & 0xFF == ord('q'):
break
