How to change partition columns in delta live tables? - databricks

I first set up a Delta Live Table using Python as follows:
import dlt

# transaction_schema and path are defined elsewhere in the notebook
@dlt.table
def transaction():
    return (
        spark
        .readStream
        .format("cloudFiles")
        .schema(transaction_schema)
        .option("cloudFiles.format", "parquet")
        .load(path)
    )
And I wrote the Delta Live Table to the target database test with this pipeline configuration:
{
"id": <id>,
"clusters": [
{
"label": "default",
"autoscale": {
"min_workers": 1,
"max_workers": 5
}
}
],
"development": true,
"continuous": false,
"edition": "core",
"photon": false,
"libraries": [
{
"notebook": {
"path": <path>
}
}
],
"name": "dev pipeline",
"storage": <storage>,
"target": "test"
}
Everything worked as expected in the first trial.
After a while, I noticed that I had forgotten to add a partition column to the table, so I dropped the table in test with DROP TABLE test.transaction and updated the notebook to:
import dlt
import pyspark.sql.functions as F

@dlt.table(
    partition_cols=["partition"],
)
def transaction():
    return (
        spark
        .readStream
        .format("cloudFiles")
        .schema(transaction_schema)
        .option("cloudFiles.format", "parquet")
        .load(path)
        .withColumn("partition", F.to_date("timestamp"))
    )
However, when I ran the pipeline again, I got an error
org.apache.spark.sql.AnalysisException: Cannot change partition columns for table transaction.
Current:
Requested: partition
Looks like I can't change the partition column by only dropping the target table.
What is the proper way to change partition columns in delta live tables?

If you have changed the partitioning schema, then instead of starting the pipeline with the Start button, you need to select the "Full refresh" option from the Start button's dropdown.
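If you start pipeline updates programmatically rather than from the UI, the same thing can be requested through the Pipelines REST API. A minimal sketch, assuming the /api/2.0/pipelines/{pipeline_id}/updates endpoint and a personal access token in the DATABRICKS_TOKEN environment variable:

import os
import requests

def start_full_refresh(host: str, pipeline_id: str) -> str:
    """Start a pipeline update that recomputes every table from scratch."""
    resp = requests.post(
        f"{host}/api/2.0/pipelines/{pipeline_id}/updates",
        headers={"Authorization": f"Bearer {os.environ['DATABRICKS_TOKEN']}"},
        json={"full_refresh": True},  # same effect as the "Full refresh" dropdown option
    )
    resp.raise_for_status()
    return resp.json()["update_id"]

A full refresh clears the existing table data and reprocesses everything from the source, which is what lets the new partition columns take effect.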

Related

Streaming not working in Delta Live table pipeline (Databricks)?

I am working on a pipeline in Databricks > Workflows > Delta Live Tables and having an issue with the streaming part.
Expectations:
One bronze table reads the json files with AutoLoader (cloudFiles), in a streaming mode (spark.readStream)
One silver table reads and flattens the bronze table in streaming (dlt.read_stream)
Result:
When taking the root location as the source (load /*, several hundred files): the pipeline starts, but the number of rows/files appended is not updated in the graph until the bronze part is completed. Then the silver part starts; the number of files/rows never updates either, and the pipeline terminates with a memory error.
When taking a very small number of files (/specific_folder among hundreds): the pipeline runs well and terminates with no error, but again, the number of rows/files appended is not updated in the graph until each part is completed.
This led me to the conclusion that the pipeline does not seem to run in streaming mode.
Maybe I am missing something about the config or how to properly run a DLT pipeline, and I would need your help on this please.
Here is the configuration of the pipeline:
{
"id": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
"clusters": [
{
"label": "default",
"aws_attributes": {
"instance_profile_arn": "arn:aws:iam::xxxxxxxxxxxx:instance-profile/iam_role_example"
},
"autoscale": {
"min_workers": 1,
"max_workers": 10,
"mode": "LEGACY"
}
}
],
"development": true,
"continuous": false,
"channel": "CURRENT",
"edition": "PRO",
"photon": false,
"libraries": [
{
"notebook": {
"path": "/Repos/user_example#xxxxxx.xx/dms/bronze_job"
}
}
],
"name": "01-landing-task-1",
"storage": "dbfs:/pipelines/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
"configuration": {
"SCHEMA": "example_schema",
"RAW_MOUNT_NAME": "xxxx",
"DELTA_MOUNT_NAME": "xxxx",
"spark.sql.parquet.enableVectorizedReader": "false"
},
"target": "landing"}
Here is the code of the pipeline (the query in the silver table contains many more columns with a get_json_object, ~30 actually):
import dlt
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window

RAW_MOUNT_NAME = spark.conf.get("RAW_MOUNT_NAME")
SCHEMA = spark.conf.get("SCHEMA")
SOURCE = spark.conf.get("SOURCE")
TABLE_NAME = spark.conf.get("TABLE_NAME")
PRIMARY_KEY_PATH = spark.conf.get("PRIMARY_KEY_PATH")

@dlt.table(
    name=f"{SCHEMA}_{TABLE_NAME}_bronze",
    table_properties={
        "quality": "bronze"
    }
)
def bronze_job():
    load_path = f"/mnt/{RAW_MOUNT_NAME}/{SOURCE}/5e*"
    return spark \
        .readStream \
        .format("text") \
        .option("encoding", "UTF-8") \
        .load(load_path) \
        .select("value", "_metadata") \
        .withColumnRenamed("value", "json") \
        .withColumn("id", F.expr(f"get_json_object(json, '$.{PRIMARY_KEY_PATH}')")) \
        .withColumn("_etl_timestamp", F.col("_metadata.file_modification_time")) \
        .withColumn("_metadata", F.col("_metadata").cast(T.StringType())) \
        .withColumn("_etl_operation", F.lit("U")) \
        .withColumn("_etl_to_delete", F.lit(False)) \
        .withColumn("_etl_file_name", F.input_file_name()) \
        .withColumn("_etl_job_processing_timestamp", F.current_timestamp()) \
        .withColumn("_etl_table", F.lit(f"{TABLE_NAME}")) \
        .withColumn("_etl_partition_date", F.to_date(F.col("_etl_timestamp"), "yyyy-MM-dd")) \
        .select("_etl_operation", "_etl_timestamp", "id", "json", "_etl_file_name", "_etl_job_processing_timestamp", "_etl_table", "_etl_partition_date", "_etl_to_delete", "_metadata")

@dlt.table(
    name=f"{SCHEMA}_{TABLE_NAME}_silver",
    table_properties={
        "quality": "silver",
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true"
    }
)
def silver_job():
    df = dlt.read_stream(f"{SCHEMA}_{TABLE_NAME}_bronze").where("_etl_table == 'extraction'")
    return df.select(
        df.id.alias('medium_id'),
        F.get_json_object(df.json, '$.request').alias('request_id'))
Thank you very much for your help!

Glue Catalog w/ Delta Tables Connected to Databricks SQL Engine

I am trying to query Delta tables from my AWS Glue Catalog on the Databricks SQL engine. They are stored in Delta Lake format. I have Glue crawlers automating schemas. The catalog is set up and functioning with non-Delta tables. The setup via Databricks loads the available tables per database via the catalog, but the query fails because Databricks uses Hive instead of Delta to read.
Incompatible format detected.
A transaction log for Databricks Delta was found at `s3://COMPANY/club/attachment/_delta_log`,
but you are trying to read from `s3://COMPANY/club/attachment` using format("hive"). You must use
'format("delta")' when reading and writing to a delta table.
To disable this check, SET spark.databricks.delta.formatCheck.enabled=false
To learn more about Delta, see https://docs.databricks.com/delta/index.html
SQL Warehouse settings => Data Access Configuration
spark.databricks.hive.metastore.glueCatalog.enabled : true
The crawler, using the DELTA LAKE setup from AWS, produces the following table metadata:
{
"StorageDescriptor": {
"cols": {
"FieldSchema": [
{
"name": "id",
"type": "string",
"comment": ""
},
{
"name": "media",
"type": "string",
"comment": ""
},
{
"name": "media_type",
"type": "string",
"comment": ""
},
{
"name": "title",
"type": "string",
"comment": ""
},
{
"name": "type",
"type": "smallint",
"comment": ""
},
{
"name": "clubmessage_id",
"type": "string",
"comment": ""
}
]
},
"location": "s3://COMPANY/club/attachment/_symlink_format_manifest",
"inputFormat": "org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat",
"outputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"compressed": "false",
"numBuckets": "-1",
"SerDeInfo": {
"name": "",
"serializationLib": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
"parameters": {}
},
"bucketCols": [],
"sortCols": [],
"parameters": {
"UPDATED_BY_CRAWLER": "CRAWLER_NAME",
"CrawlerSchemaSerializerVersion": "1.0",
"CrawlerSchemaDeserializerVersion": "1.0",
"classification": "parquet"
},
"SkewedInfo": {},
"storedAsSubDirectories": "false"
},
"parameters": {
"UPDATED_BY_CRAWLER": "CRAWLER_NAME",
"CrawlerSchemaSerializerVersion": "1.0",
"CrawlerSchemaDeserializerVersion": "1.0",
"classification": "parquet"
}
}
I am facing the same problem. It seems like you cannot use Spark SQL to query a Delta table in Glue, because setting
spark.databricks.hive.metastore.glueCatalog.enabled : true
implies that the table will be a Hive table.
You will need to access the table in S3 directly, losing the advantages of the metadata catalog.
You can read from it, though, by blocking your cluster from accessing the _delta_log folder with the following IAM policy:
{ "Sid": "BlockDeltaLog", "Effect": "Deny", "Action": "s3:*", "Resource": [ "arn:aws:s3:::BUCKET" ], "Condition": { "StringLike": { "s3:prefix": [ "_delta_log/" ] } } }
I was able to query a delta table created by glue crawlers after updating the location. In your case it would need to be changed from:
s3://COMPANY/club/attachment/_symlink_format_manifest
to
s3://COMPANY/club/attachment
This is because Delta on Spark doesn't look at _symlink_format_manifest the way Hive and Presto do; it just needs to know the table's root directory.
The command in Databricks to update the location is something like this:
ALTER TABLE my_db.my_table
SET LOCATION "s3://COMPANY/club/attachment"
Note: your database location has to be set as well in order for that command to work.
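For reference, a minimal sketch of both steps from a Databricks notebook; the ALTER DATABASE ... SET LOCATION form and the database location value are assumptions here, so adjust names and paths to your own catalog:

# Assumed placeholder location for the database; point it wherever your
# Glue database should live before changing the table location.
spark.sql("ALTER DATABASE my_db SET LOCATION 's3://COMPANY/club'")

# Point the table at the Delta table root (the directory containing _delta_log),
# not the _symlink_format_manifest path the crawler wrote for Hive/Presto.
spark.sql("ALTER TABLE my_db.my_table SET LOCATION 's3://COMPANY/club/attachment'")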

Spark cannot process recursive Avro data

I have an avsc schema like the one below:
{
"name": "address",
"type": [
"null",
{
"type":"record",
"name":"Address",
"namespace":"com.data",
"fields":[
{
"name":"address",
"type":[ "null","com.data.Address"],
"default":null
}
]
}
],
"default": null
}
On loading this data in PySpark:
from pyspark.sql import SparkSession

jsonFormatSchema = open("Address.avsc", "r").read()
spark = SparkSession.builder.appName('abc').getOrCreate()
df = spark.read.format("avro") \
    .option("avroSchema", jsonFormatSchema) \
    .load("xxx.avro")
I got the following exception:
"Found recursive reference in Avro schema, which can not be processed by Spark"
I tried many other configurations, but without any success.
To execute it, I use spark-submit with:
--packages org.apache.spark:spark-avro_2.12:3.0.1
This is intended behavior; you can take a look at the issue:
https://issues.apache.org/jira/browse/SPARK-25718
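Since Spark's Avro source rejects recursive schemas by design, one workaround (not from the original answer) is to decode the files outside the Avro data source and keep only a bounded number of nesting levels. A rough sketch using the fastavro package, with field names assumed from the Address schema above:

import fastavro

def flatten_address(record, max_depth=3):
    """Unroll the recursive 'address' chain into at most max_depth levels."""
    levels, node, depth = [], record, 0
    while isinstance(node, dict) and depth < max_depth:
        # keep every field except the self-referencing one
        levels.append({k: v for k, v in node.items() if k != "address"})
        node = node.get("address")
        depth += 1
    return levels

rows = []
with open("xxx.avro", "rb") as f:
    for rec in fastavro.reader(f):  # fastavro accepts self-referencing schemas
        rows.append(flatten_address(rec))

# rows can then be loaded into a DataFrame with an explicit, non-recursive
# schema, e.g. spark.createDataFrame(rows, schema=...)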

Azure Data Factory copy activity: Evaluate column in sink table with @pipeline().TriggerTime

With Data Factory V2 I'm trying to implement a stream of data copies from one Azure SQL database to another.
I have mapped all the columns of the source table to the sink table, but in the sink table I have an empty column where I would like to enter the pipeline run time.
Does anyone know how to fill this column in the sink table without it being present in the source table?
Below there is the code of my copy pipeline
{
"name": "FLD_Item_base",
"properties": {
"activities": [
{
"name": "Copy_Team",
"description": "copytable",
"type": "Copy",
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"source": {
"type": "SqlSource"
},
"sink": {
"type": "SqlSink",
"writeBatchSize": 10000,
"preCopyScript": "TRUNCATE TABLE Team_new"
},
"enableStaging": false,
"dataIntegrationUnits": 0,
"translator": {
"type": "TabularTranslator",
"columnMappings": {
"Code": "Code",
"Name": "Name"
}
}
},
"inputs": [
{
"referenceName": "Team",
"type": "DatasetReference"
}
],
"outputs": [
{
"referenceName": "Team_new",
"type": "DatasetReference"
}
]
}
]
}
}
In my sink table I already have the column data_load where I would like to insert the pipeline execution date, but I have not currently mapped it.
Based on your situation, please configure a SQL Server stored procedure in your SQL Server sink as a workaround.
Please follow the steps from this doc:
Step 1: Configure your Sink dataset.
Step 2: Configure the Sink section in the copy activity.
Step 3: In your database, define the table type with the same name as sqlWriterTableType. Notice that the schema of the table type should be the same as the schema returned by your input data.
CREATE TYPE [dbo].[testType] AS TABLE(
[ID] [varchar](256) NOT NULL,
[EXECUTE_TIME] [datetime] NOT NULL
)
GO
Step 4: In your database, define the stored procedure with the same name as SqlWriterStoredProcedureName. It handles input data from your specified source and merges it into the output table. Notice that the parameter name of the stored procedure should be the same as the "tableName" defined in the dataset.
CREATE PROCEDURE convertCsv @ctest [dbo].[testType] READONLY
AS
BEGIN
    MERGE [dbo].[adf] AS target
    USING @ctest AS source
    ON (1 = 1)
    WHEN NOT MATCHED THEN
        INSERT (id, executeTime)
        VALUES (source.ID, GETDATE());
END
You can consider using a stored procedure on the sink side to apply the source data to the sink table by designating "sqlWriterStoredProcedureName" on the SqlSink. Pass the pipeline run time to the stored procedure as a parameter and insert it into the sink table.
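As an illustration only (these property names are my assumption of the SqlSink configuration, not something from the original answer), the copy activity's sink section would then reference the stored procedure and table type defined above roughly like this, written here as a Python dict mirroring the JSON:

# Hypothetical SqlSink typeProperties tying the pieces together; verify the
# property names against the ADF copy activity documentation for your version.
sql_sink = {
    "type": "SqlSink",
    "sqlWriterStoredProcedureName": "convertCsv",  # stored procedure from Step 4
    "sqlWriterTableType": "testType",              # table type from Step 3
}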

How to use Azure Data Factory for importing data incrementally from MySQL to Azure Data Warehouse?

I'm using Azure Data Factory to periodically import data from MySQL to Azure SQL Data Warehouse.
The data goes through staging blob storage on an Azure storage account, but when I run the pipeline it fails because it can't separate the blob text back into columns. Each row that the pipeline tries to insert into the destination becomes one long string containing all the column values delimited by a "⯑" character.
I used Data Factory before, without trying the incremental mechanism, and it worked fine. I don't see a reason it would cause such behavior, but I'm probably missing something.
I'm attaching the JSON that describes the pipeline, with some minor naming changes; please let me know if you see anything that can explain this.
Thanks!
EDIT: Adding exception message:
Failed execution Database operation failed. Error message from database execution:
ErrorCode=FailedDbOperation,'Type=Microsoft.DataTransfer.Common.Shared.HybridDeliveryException,Message=Error happened when loading data into SQL Data Warehouse.,Source=Microsoft.DataTransfer.ClientLibrary,''Type=System.Data.SqlClient.SqlException,Message=Query aborted-- the maximum reject threshold (0 rows) was reached while reading from an external source: 1 rows rejected out of total 1 rows processed.
(/f4ae80d1-4560-4af9-9e74-05de941725ac/Data.8665812f-fba1-407a-9e04-2ee5f3ca5a7e.txt)
Column ordinal: 27, Expected data type: VARCHAR(45) collate SQL_Latin1_General_CP1_CI_AS, Offending value: * ROW OF VALUES * (Tokenization failed), Error: Not enough columns in this line.,},],'.
{
"name": "CopyPipeline-move_incremental_test",
"properties": {
"activities": [
{
"type": "Copy",
"typeProperties": {
"source": {
"type": "RelationalSource",
"query": "$$Text.Format('select * from [table] where InsertTime >= \\'{0:yyyy-MM-dd HH:mm}\\' AND InsertTime < \\'{1:yyyy-MM-dd HH:mm}\\'', WindowStart, WindowEnd)"
},
"sink": {
"type": "SqlDWSink",
"sqlWriterCleanupScript": "$$Text.Format('delete [schema].[table] where [InsertTime] >= \\'{0:yyyy-MM-dd HH:mm}\\' AND [InsertTime] <\\'{1:yyyy-MM-dd HH:mm}\\'', WindowStart, WindowEnd)",
"allowPolyBase": true,
"polyBaseSettings": {
"rejectType": "Value",
"rejectValue": 0,
"useTypeDefault": true
},
"writeBatchSize": 0,
"writeBatchTimeout": "00:00:00"
},
"translator": {
"type": "TabularTranslator",
"columnMappings": "column1:column1,column2:column2,column3:column3"
},
"enableStaging": true,
"stagingSettings": {
"linkedServiceName": "StagingStorage-somename",
"path": "somepath"
}
},
"inputs": [
{
"name": "InputDataset-input"
}
],
"outputs": [
{
"name": "OutputDataset-output"
}
],
"policy": {
"timeout": "1.00:00:00",
"concurrency": 10,
"style": "StartOfInterval",
"retry": 3,
"longRetry": 0,
"longRetryInterval": "00:00:00"
},
"scheduler": {
"frequency": "Hour",
"interval": 1
},
"name": "Activity-0-_Custom query_->[schema]_[table]"
}
],
"start": "2017-06-01T05:29:12.567Z",
"end": "2099-12-30T22:00:00Z",
"isPaused": false,
"hubName": "datafactory_hub",
"pipelineMode": "Scheduled"
}
}
It sounds like what you're doing is right, but the data is poorly formed (a common problem: non-UTF-8 encoding), so ADF can't parse the structure as you require. When I encounter this I often have to add a custom activity to the pipeline that cleans and prepares the data so it can then be used in a structured way by downstream activities. Unfortunately this is a big overhead in the development of the solution and will require you to write a C# class to deal with the data transformation.
Also remember ADF has no compute of its own; it only invokes other services, so you'll also need an Azure Batch service to execute the compiled code.
Sadly there is no magic fix here. Azure is great at extracting and loading your perfectly structured data, but in the real world we need other services to do the transform or cleaning, meaning we need a pipeline that can do ETL, or, as I prefer, ECTL.
Here's a link on creating ADF custom activities to get you started: https://www.purplefrogsystems.com/paul/2016/11/creating-azure-data-factory-custom-activities/
Hope this helps.
I've been struggling with the same message, sort of, when importing from an Azure SQL DB to Azure DWH using Data Factory v2 with staging (which implies PolyBase). I've learned that PolyBase will fail with error messages related to incorrect data types, etc. The message I received is very similar to the one mentioned here, even though I'm not using PolyBase directly from SQL, but via Data Factory.
Anyway, the solution for me was to avoid NULL values for columns of decimal or numeric type, e.g. ISNULL(mynumericCol, 0) as mynumericCol.
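In this pipeline that means adjusting the copy activity's source query so PolyBase never receives NULLs in those columns. A minimal sketch with a hypothetical numeric column name, mirroring the "query" value shown above:

# Hypothetical rewrite of the source query: wrap nullable numeric columns in
# ISNULL so the staged text always contains a concrete value.
source_query = (
    "$$Text.Format('select column1, column2, "
    "ISNULL(my_numeric_col, 0) as my_numeric_col "
    "from [table] "
    "where InsertTime >= \\'{0:yyyy-MM-dd HH:mm}\\' "
    "AND InsertTime < \\'{1:yyyy-MM-dd HH:mm}\\'', WindowStart, WindowEnd)"
)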
