PYSPARK org.apache.spark.sql.AnalysisException: cannot resolve '`INPUT__FILE__NAME`' given input columns - python-3.x

I am using a configuration file as below:
"trial":{
"stage_table": "trial_stg",
"folder_location": "Trial",
"column_mapping": [
{
"source_column": "(split(INPUT__FILE__NAME, '\\/')[11])",
"source_datatype": "text",
"target_column": "indication",
"target_datatype": "text",
"transform_type": "expression",
"validate": false
}
I am trying to get the file name using the INPUT__FILE__NAME function in PySpark, but I am running into an error. Below is the code that consumes this config file:
def query_expression_builder(mapping):
    print("Inside query_expression_builder")
    print("mapping :", mapping)

    def match_transform_type(map_col):
        print("Inside match_transform_type")
        if map_col.get('transform_type') is None:
            print("transform_type is", map_col.get('transform_type'))
            print("map_col inside if :", map_col)
            return f"`{map_col['source_column']}` AS {map_col['target_column']}"
        elif str(map_col.get('transform_type')) == 'expression':
            print("transform_type is", map_col.get('transform_type'))
            print("map_col inside elif :", map_col)
            return f"{map_col['source_column']} AS {map_col['target_column']}"
        else:
            print("transform_type is", map_col.get('transform_type'))
            print("map_col inside else :", map_col)
            return f"`{map_col['source_column']}` AS {map_col['target_column']}"

    if mapping is None:
        print("Check for mapping is None")
        return []
    else:
        print("Mapping is not None")
        return [match_transform_type(map_col=col_mapping) for col_mapping in mapping]
def main():
    query = query_expression_builder(
        mapping=config['file_table_mapping'][tbl]['column_mapping'])
    print(f"Table = {tbl} Executing query {query}")
    file_path = (f"s3://{config['raw_bucket']}/{config['landing_directory']}/"
                 f"{config['file_table_mapping'][tbl]['folder_location']}/"
                 f"{config_audit['watermark_timestamp']}*.csv")
    write_df = spark.read.csv(path=file_path, header=True, inferSchema=False) \
        .selectExpr(query) \
        .withColumn("prcs_run_id", func.lit(config_audit['prcs_run_id'])) \
        .withColumn("job_run_id", func.lit(config_audit['job_run_id'])) \
        .withColumn("ins_ts", func.lit(ins_ts)) \
        .withColumn("rec_crt_user", func.lit(config["username"])) 
    write_df.show()
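For the sample mapping above, the transform_type is "expression", so the builder returns source_column verbatim and selectExpr has to parse it. A quick standalone illustration (escape simplified, debug prints omitted) of what reaches selectExpr:
mapping = [{
    "source_column": "(split(INPUT__FILE__NAME, '/')[11])",
    "target_column": "indication",
    "transform_type": "expression",
}]
print(query_expression_builder(mapping))
# ["(split(INPUT__FILE__NAME, '/')[11]) AS indication"]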
Below is the error I am getting:
"cannot resolve '`INPUT__FILE__NAME`' given input columns: [Pediatric Patients included (Y/N), Trial registry number, Number of patients, Sponsor, Number of treatment arms, Multicenter study, Trial Conclusion, Clinical Phase, Study Population, Country Codes, Exclusion criteria, Trial ID, Trial AcronymDerived, Comments, Countries, Trial registry name, Sample size calculation details, Randomisation, Blinding, Trial Comments, Trial start year, Trial end year, Inclusion criteria, Study treatment, Trial design, Controlled trial, Trial Acronym, Trial Control, Asymptomatic patients, Analysis method details]; line 1 pos 7;\n'Project ['split('INPUT__FILE__NAME, /)[11] AS indication#4346, Trial ID#4286 AS trial_id#4347, Trial Acronym#4287 AS trial_acronym#4348, Trial AcronymDerived#4288 AS trial_acronym_derived#4349, Sponsor#4289 AS sponsor#4350, Asymptomatic patients#4290 AS asymptomatic_patients#4351, Pediatric Patients included (Y/N)#4291 AS pediatric_patients_included#4352, Number of patients#4292 AS num_of_patients#4353, Number of treatment arms#4293 AS num_of_treatment_arms#4354, Trial start year#4294 AS trial_strt_yr#4355, Trial end year#4295 AS trial_end_yr#4356, Clinical Phase#4296 AS clinical_phase#4357, Study Population#4297 AS study_population#4358, Study treatment#4298 AS study_treatment#4359, Randomisation#4299 AS randomization#4360, Controlled trial#4300 AS controlled_trial#4361, Trial Control#4301 AS trial_control#4362, Blinding#4302 AS blinding#4363, Trial registry name#4303 AS trial_registry_name#4364, Trial registry number#4304 AS trial_registry_num#4365, Countries#4305 AS countries#4366, Country Codes#4306 AS country_codes#4367, Trial design#4307 AS trial_design#4368, Multicenter study#4308 AS multicenter_study#4369, ... 7 more fields]\n+- Relation[Trial ID#4286,Trial Acronym#4287,Trial AcronymDerived#4288,Sponsor#4289,Asymptomatic patients#4290,Pediatric Patients included (Y/N)#4291,Number of patients#4292,Number of treatment arms#4293,Trial start year#4294,Trial end year#4295,Clinical Phase#4296,Study Population#4297,Study treatment#4298,Randomisation#4299,Controlled trial#4300,Trial Control#4301,Blinding#4302,Trial registry name#4303,Trial registry number#4304,Countries#4305,Country Codes#4306,Trial design#4307,Multicenter study#4308,Inclusion criteria#4309,... 6 more fields] csv\n"
Traceback (most recent call last):
File "/mnt/yarn/usercache/root/appcache/application_1594568207850_0001/container_1594568207850_0001_01_000001/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/mnt/yarn/usercache/root/appcache/application_1594568207850_0001/container_1594568207850_0001_01_000001/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o529.selectExpr.
: org.apache.spark.sql.AnalysisException: cannot resolve '`INPUT__FILE__NAME`' given input columns:
How can I use the INPUT__FILE__NAME function? I have already enabled Hive support in my code. Or is there another way to do this? I cannot find anything on the net on how to use this function.

Try using input_file_name() (single underscores) instead of the double-underscore name. INPUT__FILE__NAME is a Hive virtual column and is not available when you read CSV files directly with spark.read.csv; Spark's built-in input_file_name() function is the equivalent.
Example:
from pyspark.sql.functions import input_file_name

spark.sql("select *, input_file_name() from tmp").show()
# or
df.withColumn("filename", input_file_name()).show()
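Applied to the config-driven setup in the question, a minimal sketch (the bucket and path here are placeholders) would swap the Hive virtual column for input_file_name() inside the mapping expression:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# In the config, the mapping entry would become:
#   "source_column": "(split(input_file_name(), '/')[11])"
df = spark.read.csv("s3://my-bucket/landing/Trial/", header=True)  # placeholder path
df.selectExpr("split(input_file_name(), '/')[11] AS indication", "*").show()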

Related

Encountered an internal AutoML error- ClientException: Message: No objects to concatenate

I am trying to implement hierarchical time series forecasting with Azure AutoML pipelines.
I followed this notebook for the implementation:
https://github.com/Azure/azureml-examples/blob/main/v1/python-sdk/tutorials/automl-with-azureml/forecasting-hierarchical-timeseries/auto-ml-forecasting-hierarchical-timeseries.ipynb
The training pipeline works when I run it on a compute instance, but when I run the same pipeline on a compute cluster it breaks at the hts-proportion-calculation step.
This is the error I am getting:
system error:
Encountered an internal AutoML error. Error Message/Code: ClientException. Additional Info: ClientException:
      Message: No objects to concatenate
      InnerException: None
      ErrorResponse
{
"error": {
"message": "No objects to concatenate"
}
}
logs:
Loading arguments for scenario proportions-calculation
adding argument --input-medatadata
adding argument --hts-graph
adding argument --enable-event-logger
Input arguments dict is {'--input-medatadata': '/mnt/azureml/cr/j/85509be625484b6caa3c1d97b7ab2e33/cap/data-capability/wd/INPUT_automl_training_workspaceblobstore/azureml/17ca5ae7-7269-4246-888f-e781071e3f5c/automl_training', '--hts-graph': '/mnt/azureml/cr/j/85509be625484b6caa3c1d97b7ab2e33/cap/data-capability/wd/INPUT_hts_graph_workspaceblobstore/azureml/a2c1b15a-c895-41e8-b6a6-1ca37ebe9e77/hts_graph', '--enable-event-logger': None}
Unknown file to proceed outputs.txt
processing: outputs.txt with type None.
Cleaning up all outstanding Run operations, waiting 300.0 seconds
3 items cleaning up...
Cleanup took 0.001676321029663086 seconds
Traceback (most recent call last):
File "proportions_calculation_wrapper.py", line 47, in <module>
runtime_wrapper.run()
File "/azureml-envs/azureml_e34d0633ffc4cb2fa25d91e3da5f59be/lib/python3.7/site-packages/azureml/train/automl/runtime/_many_models/automl_pipeline_step_wrapper.py", line 63, in run
self._run()
File "/azureml-envs/azureml_e34d0633ffc4cb2fa25d91e3da5f59be/lib/python3.7/site-packages/azureml/train/automl/runtime/_hts/proportions_calculation.py", line 44, in _run
proportions_calculation(self.arguments_dict, self.event_logger, script_run=self.step_run)
File "/azureml-envs/azureml_e34d0633ffc4cb2fa25d91e3da5f59be/lib/python3.7/site-packages/azureml/train/automl/runtime/_hts/proportions_calculation.py", line 173, in proportions_calculation
proportion_files_list, forecasting_parameters.time_column_name, graph.label_column_name
File "/azureml-envs/azureml_e34d0633ffc4cb2fa25d91e3da5f59be/lib/python3.7/site-packages/azureml/train/automl/runtime/_hts/proportions_calculation.py", line 92, in calculate_time_agg_sum_for_all_files
df = pd.concat(pool.map(concat_func, files_batches), ignore_index=True)
File "/azureml-envs/azureml_e34d0633ffc4cb2fa25d91e3da5f59be/lib/python3.7/site-packages/pandas/util/_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "/azureml-envs/azureml_e34d0633ffc4cb2fa25d91e3da5f59be/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 304, in concat
sort=sort,
File "/azureml-envs/azureml_e34d0633ffc4cb2fa25d91e3da5f59be/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 351, in __init__
raise ValueError("No objects to concatenate")
ValueError: No objects to concatenate
Please let me know how I can resolve this issue.
This error occurred because the iteration timeout was not less than the experiment timeout; the system error and logs are misleading. The logs pointed at pandas ("No objects to concatenate") via:
df = pd.concat(pool.map(concat_func, files_batches), ignore_index=True)
The error can be avoided by setting the iteration timeout to less than the experiment timeout. I had set iteration_timeout_minutes=60, which, with experiment_timeout_hours=1 (a 60-minute budget), consumed the entire experiment window and caused the error.
automl_settings = AutoMLConfig(
    task="forecasting",
    primary_metric="normalized_root_mean_squared_error",
    experiment_timeout_hours=1,
    label_column_name=label_column_name,
    track_child_runs=False,
    forecasting_parameters=forecasting_parameters,
    pipeline_fetch_max_batch_size=15,
    model_explainability=model_explainability,
    n_cross_validations="auto",  # Feel free to set to a small integer (>=2) if runtime is an issue.
    cv_step_size="auto",
    # The following settings are specific to this sample and should be adjusted according to your own needs.
    iteration_timeout_minutes=10,
    iterations=15,
)
We are able to run the sample successfully using the compute cluster as given below.
from azureml.core.compute import ComputeTarget, AmlCompute

# Name your cluster
compute_name = "hts-compute"

if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print("Found compute target: " + compute_name)
else:
    print("Creating a new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D16S_V3", max_nodes=20
    )
    # Create the compute target
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20
    )
    # For a more detailed view of current cluster status, use the 'status' property
    print(compute_target.status.serialize())

PyAlgoTrade: How to use resampleBarFeed with multiple instruments?

I am resampling a few instruments with PyAlgoTrade.
I have a base bar feed for 1-minute data, which is working fine.
I have added a resampler to resample to 2 minutes, as follows:
class Strategy(strategy.BaseStrategy):
    def __init__(self, instruments, feed, brk):
        strategy.BaseStrategy.__init__(self, feed, brk)
        self.__position = None
        self.__instrument = instruments
        self._resampledBF = self.resampleBarFeed(
            2 * bar.Frequency.MINUTE, self.resampledOnBar_2minute)
        self.info("initialised strategy")
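The resample callback referenced above is not shown in the question; for context, a sketch of what it would typically look like, assuming PyAlgoTrade's standard onBars-style signature:
def resampledOnBar_2minute(self, bars):
    # Called once per completed 2-minute bar set, mirroring onBars.
    for instrument in bars.getInstruments():
        bar_ = bars[instrument]
        self.info("2-MIN: %s: Date: %s Close: %s" % (
            instrument, bar_.getDateTime(), bar_.getClose()))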
I got this error:
2022-09-08 12:36:00,396 strategy [INFO] 1-MIN: INSTRUMENT1: Date: 2022-09-08 12:35:00+05:30 Open: 17765.55 High: 17774.5 Low: 17765.35 Close: 1777
myStrategy.run()
File "pyalgotrade\pyalgotrade\strategy\__init__.py", line 514, in run
self.__dispatcher.run()
File "pyalgotrade\pyalgotrade\dispatcher.py", line 109, in run
eof, eventsDispatched = self.__dispatch()
File "pyalgotrade\pyalgotrade\dispatcher.py", line 97, in __dispatch
if self.__dispatchSubject(subject, smallestDateTime):
File "pyalgotrade\pyalgotrade\dispatcher.py", line 75, in __dispatchSubject ret = subject.dispatch() is True
File "pyalgotrade\pyalgotrade\feed\__init__.py", line 106, in dispatch
dateTime, values = self.getNextValuesAndUpdateDS()
File "pyalgotrade\pyalgotrade\feed\__init__.py", line 81, in getNextValuesAndUpdateDS
dateTime, values = self.getNextValues()
File "pyalgotrade\pyalgotrade\barfeed\__init__.py", line 101, in getNextValues
raise Exception(
Exception: Bar date times are not in order. Previous datetime was 2022-09-08 12:34:00+05:30 and current datetime is 2022-09-08 12:34:00+05:30
However, the error does not occur if the self._resampledBF = self.resampleBarFeed(...) line is commented out.
Also, while searching online, I found a similar report and possible fix posted earlier on Google Groups: https://groups.google.com/g/pyalgotrade/c/v9ht1Bfz5Ds/m/ojF8uH8sFwAJ
The solution recommended there was:
Sorry never mind, I fixed it. Using current timestamp instead of the one from IB and that fixed it.
I am not sure whether this has been resolved. I would like to know how to resolve the error while resampling.

Failing to use sumproduct on date ranges with multiple conditions [Python]

From the replacement data table (below in the image), I am trying to incorporate the solbox product replacements into the time series data format (above in the image). I need to extract the number of consumers per day from this information.
What I need to find out:
On a specific date, how many solbox products were active
On a specific date, how many solbox products (belonging to consumers) were active
I have used this line of code in Excel but cannot implement it properly in Python:
=SUMPRODUCT((Record_Solbox_Replacement!$O$2:$O$1367 = "consumer") * (A475>=Record_Solbox_Replacement!$L$2:$L$1367)*(A475<Record_Solbox_Replacement!$M$2:$M$1367))
I tried this in Python:
timebase_df['date'] = pd.date_range(start=replace_table_df['solbox_started'].min(),
                                    end=replace_table_df['solbox_started'].max(),
                                    freq=frequency)
timebase_df['date_unix'] = timebase_df['date'].astype(np.int64) // 10**9
timebase_df['no_of_solboxes'] = ((timebase_df['date_unix'] >= replace_table_df['started'].to_numpy())
                                 & (timebase_df['date_unix'] < replace_table_df['ended'].to_numpy())
                                 & (replace_table_df['customer_type'] == 'customer'))
ERROR:
~\Anaconda3\Anaconda4\lib\site-packages\pandas\core\ops\array_ops.py in comparison_op(left, right, op)
232 # The ambiguous case is object-dtype. See GH#27803
233 if len(lvalues) != len(rvalues):
--> 234 raise ValueError("Lengths must match to compare")
235
236 if should_extension_dispatch(lvalues, rvalues):
ValueError: Lengths must match to compare
Can someone help me, please? I can explain in the comments if I have missed something.
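For reference, the ValueError arises because timebase_df and replace_table_df have different lengths, so the element-wise comparison cannot line up; the Excel SUMPRODUCT instead compares every date against every replacement row. A minimal NumPy broadcasting sketch of that logic (assuming the column names from the snippets above):
import numpy as np

# Broadcast to an (n_dates, n_replacements) boolean matrix, then sum each
# row -- the NumPy equivalent of the Excel SUMPRODUCT above.
dates = timebase_df['date_unix'].to_numpy()[:, None]        # (n_dates, 1)
started = replace_table_df['started'].to_numpy()[None, :]   # (1, n_repl)
ended = replace_table_df['ended'].to_numpy()[None, :]       # (1, n_repl)
is_consumer = (replace_table_df['customer_type'] == 'consumer').to_numpy()[None, :]

timebase_df['no_of_solboxes'] = ((dates >= started) & (dates < ended) & is_consumer).sum(axis=1)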

for loop over list KeyError: 664

I am trying to iterate over this list of words:
CTCCTC TCCTCT CCTCTC CTCTCC TCTCCC CTCCCA TCCCAA CCCAAA CCAAAC CAAACT
CTGGGC TGGGCC GGGCCA GGCCAA GCCAAT CCAATG CAATGC AATGCC ATGCCT TGCCTG GCCTGC
TGCCAG GCCAGG CCAGGA CAGGAG AGGAGG GGAGGG GAGGGG AGGGGC GGGGCT GGGCTG GGCTGG GCTGGT CTGGTC
TGGTCT GGTCTG GTCTGG TCTGGA CTGGAC TGGACA GGACAC GACACT ACACTA CACTAT
ATTCAG TTCAGC TCAGCC CAGCCA AGCCAG GCCAGT CCAGTC CAGTCA AGTCAA GTCAAC TCAACA CAACAC AACACA
ACACAA CACAAG ACAAGG AGGTGG GGTGGC GTGGCC TGGCCT GGCCTG GCCTGC CCTGCA CTGCAC
TGCACT GCACTC CACTCG ACTCGA CTCGAG TCGAGG CGAGGT GAGGTT AGGTTC GGTTCC
TATATA ATATAC TATACC ATACCT TACCTG ACCTGG CCTGGT CTGGTA TGGTAA GGTAAT GTAATG TAATGG AATGGA
I am using a for loop to read each item in the list and pass it through mk_model.vector. The code used is as follows:
for x in all_seq_sentences[:]:
    mk_model.vector(x)
    print(x)
Usually, mk_model.vector("AGT") will give an array corresponding to the defined dna2vec model, but here, rather than actually performing the model run, it throws this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-144-77c47b13e98a> in <module>
1 for x in all_seq_sentences[:]:
----> 2 mk_model.vector(x)
3 print(x)
4
~/Desktop/DNA2vec/dna2vec/dna2vec/multi_k_model.py in vector(self, vocab)
35
36 def vector(self, vocab):
---> 37 return self.data[len(vocab)].model[vocab]
38
39 def unitvec(self, vec):
KeyError: 664
Looking forward to some help here.
The problem was that the for loop took all the items in a line as one item, which is why .split() was the best solution. To read more, follow https://python-reference.readthedocs.io/en/latest/docs/str/split.html
working code:
for i in all_seq_sentences:
    word = i.split()
    print(word[0])
Then implement another loop to access the model.vector function:
vec_of_all_seq = []
for sentence in all_seq_sentences:
    sentence = sentence.split()
    for word in sentence:
        vec_of_all_seq.append(mk_model.vector(word))
The vector representations derived from model.vector are collected in the list vec_of_all_seq.
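If a single NumPy array is needed downstream, a quick conversion (assuming every vector has the same length) is:
import numpy as np

# Stack the per-word vectors into one matrix of shape (num_words, embedding_dim).
vec_matrix = np.array(vec_of_all_seq)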

Tkinter Alarm Clock

I am trying to build a countdown timer with tkinter. I want to pass the values from the entries to the countdown(count) function. Here is what I tried:
import tkinter

def countdown(count):
    label['text'] = count
    if count > 0:
        top.after(1000, countdown, count - 1)

top = tkinter.Tk()
top.geometry("700x100")
hoursT = tkinter.Label(top, text="Hours:")
hoursE = tkinter.Entry(top)
minuteT = tkinter.Label(top, text="Minutes:")
minuteE = tkinter.Entry(top)
secondT = tkinter.Label(top, text="Seconds:")
secondE = tkinter.Entry(top)
hoursT.grid(row=1, column=1)
hoursE.grid(row=1, column=2)
minuteT.grid(row=1, column=3)
minuteE.grid(row=1, column=4)
secondT.grid(row=1, column=5)
secondE.grid(row=1, column=6)
label = tkinter.Label(top)
label.grid(row=3)
t = (int(hoursE.get())*360 + int(minuteT.get())*60 + int(secondE.get()))
button = tkinter.Button(top, text="Start Timer", command=lambda count=t: countdown(count))
button.grid(row=2)
However, I get this error:
Traceback (most recent call last):
File "C:\Users\charley.ACER-PC\AppData\Local\Programs\Python\Python35- 32\tkinterTutorial.py", line 30, in <module>
t=(int(hoursE.get())*360+int(minuteT.get())*60+int(secondE.get()))
ValueError: invalid literal for int() with base 10: ''
How can I run this code:
t = (int(hoursE.get())*360 + int(minuteT.get())*60 + int(secondE.get()))
only when the entries are filled with integers?
Thanks :)
One solution is to create the button up front and update its command at run time with event listeners:
button = tkinter.Button(top, text="Start Timer", command=lambda: None)
button.grid(row=2)

def updateButton(event=None):  # bind() passes an event object
    # Only wire up the countdown once all three entries contain digits.
    # (Read the Entry widgets -- minuteE, not the minuteT label.)
    hour, min_, sec = hoursE.get(), minuteE.get(), secondE.get()
    if hour.isdigit() and min_.isdigit() and sec.isdigit():
        time = int(hour)*360 + int(min_)*60 + int(sec)
        button.configure(command=lambda count=time: countdown(count))

for widget in (hoursE, minuteE, secondE):
    widget.bind("<FocusOut>", updateButton)
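An alternative sketch (not from the original answer) validates inside the button's command at click time, so no event bindings are needed; note it uses 3600 seconds per hour rather than the 360 in the snippets above:
def start_timer():
    # Only start the countdown when all three fields contain digits.
    values = [hoursE.get(), minuteE.get(), secondE.get()]
    if all(v.isdigit() for v in values):
        h, m, s = (int(v) for v in values)
        countdown(h * 3600 + m * 60 + s)

button = tkinter.Button(top, text="Start Timer", command=start_timer)
button.grid(row=2)
top.mainloop()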
