I'm writing an Azure Resource Graph query to extract all VMs, including the backup policy if one is associated and, if so, the retention details of the backup. But I'm getting an error saying:
Table RecoveryServicesResources was referenced as right table 2 times, which exceeded the limit of 1. Please see https://aka.ms/resourcegraph-tables for help. (Code:DisallowedMaxNumberOfRemoteTables)
To get all backup policies and associated devices:
Graph Query 1
RecoveryServicesResources
| where type in~ ('Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems')
| where properties.backupManagementType == "AzureIaasVM"
| extend vaultName = case(type =~ 'microsoft.dataprotection/backupVaults/backupInstances',split(split(id, '/Microsoft.DataProtection/backupVaults/')[1],'/')[0],type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',split(split(id, '/Microsoft.RecoveryServices/vaults/')[1],'/')[0],'--')
| extend dataSourceType = case(type=~'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.backupManagementType,type =~ 'microsoft.dataprotection/backupVaults/backupInstances',properties.dataSourceSetInfo.datasourceType,'--')
| extend friendlyName = properties.friendlyName
| extend dsResourceGroup = split(split(properties.dataSourceInfo.resourceID, '/resourceGroups/')[1],'/')[0]
| extend dsSubscription = split(split(properties.dataSourceInfo.resourceID, '/subscriptions/')[1],'/')[0]
| extend primaryLocation = properties.dataSourceInfo.resourceLocation
| extend policyName = case(type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.policyName, type =~ 'microsoft.dataprotection/backupVaults/backupInstances', properties.policyInfo.name, '--')
| extend protectionState = properties.currentProtectionState
| extend vmProperties = properties
| where protectionState in~ ('ConfiguringProtection','ProtectionConfigured','ConfiguringProtectionFailed','ProtectionStopped','SoftDeleted','ProtectionError')
| project id, friendlyName, dataSourceType, dsResourceGroup, dsSubscription, vmProperties, vaultName, protectionState, policyName,primaryLocation
| join kind = leftouter (
RecoveryServicesResources
| where type == 'microsoft.recoveryservices/vaults/backuppolicies'
| extend xvaultName = case(type == 'microsoft.recoveryservices/vaults/backuppolicies', split(split(id, 'microsoft.recoveryservices/vaults/')[1],'/')[0],type == 'microsoft.recoveryservices/vaults/backuppolicies', split(split(id, 'microsoft.recoveryservices/vaults/')[1],'/')[0],'--')
| extend datasourceType = case(type == 'microsoft.recoveryservices/vaults/backuppolicies', properties.backupManagementType,type == 'microsoft.dataprotection/backupVaults/backupPolicies',properties.datasourceTypes[0],'--')
| extend policyID = id
| extend dailyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.dailySchedule.retentionDuration,',')[0])))))
| extend daylyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.dailySchedule.retentionDuration,',')[1])))))
| extend DailyBackup = strcat("Daily, retention Duration ", daylyLTR, " ", dailyDurationType)
| extend weeklyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.weeklySchedule.retentionDuration,',')[0])))))
| extend weeklyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.weeklySchedule.retentionDuration,',')[1])))))
| extend weeklyStartDate = split(tostring(properties.retentionPolicy.weeklySchedule.daysOfTheWeek),'"')[1]
| extend WeeklyBackup = strcat("Every ", weeklyStartDate, ", retention Duration ", weeklyLTR, " ", weeklyDurationType)
| extend monthlyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.monthlySchedule.retentionDuration,',')[0])))))
| extend monthlyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.monthlySchedule.retentionDuration,',')[1])))))
| extend monthlyStartDayWeek = split(tostring(properties.retentionPolicy.monthlySchedule.retentionScheduleWeekly.daysOfTheWeek),'"')[1]
| extend monthlyStartWeekMonth = split(tostring(properties.retentionPolicy.monthlySchedule.retentionScheduleWeekly.weeksOfTheMonth),'"')[1]
| extend MonthlyBackup = strcat("Every ", monthlyStartDayWeek, " ", monthlyStartWeekMonth, " Week, retention Duration ", monthlyLTR, " " , monthlyDurationType)
| extend yearDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.yearlySchedule.retentionDuration,',')[0])))))
| extend yearLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.yearlySchedule.retentionDuration,',')[1])))))
| extend yearlyStartDayWeek = split(tostring(properties.retentionPolicy.yearlySchedule.retentionScheduleWeekly.daysOfTheWeek),'"')[1]
| extend yearlyStartWeekMonth = split(tostring(properties.retentionPolicy.yearlySchedule.retentionScheduleWeekly.weeksOfTheMonth),'"')[1]
| extend yearlyStartMonth = split(tostring(properties.retentionPolicy.yearlySchedule.monthsOfYear),'"')[1]
| extend YearlyBackup = strcat("Every month ", yearlyStartWeekMonth, " ", yearlyStartDayWeek, ", retention Duration ", yearLTR, " ", yearDurationType)
| project resourceId = tolower(tostring(properties.sourceResourceId)),policyID,policyName=name,DailyBackup,WeeklyBackup,MonthlyBackup,YearlyBackup,daylyLTR,weeklyLTR,monthlyLTR,yearLTR) on policyName
Graph Query 2
To get all VMs and whether they have a backup policy:
Resources
| where type in~ ('microsoft.compute/virtualmachines','microsoft.classiccompute/virtualmachines')
| extend resourceId=tolower(id)
| extend sku = properties.storageProfile.imageReference.sku
| extend publisher = properties.storageProfile.imageReference.publisher
| extend offer = properties.storageProfile.imageReference.offer
| extend ostype = properties.storageProfile.osDisk.osType
| extend hardwareType = properties.hardwareProfile
| join kind = leftouter (
RecoveryServicesResources
| where type in~ ("microsoft.recoveryservices/vaults/backupfabrics/protectioncontainers/protecteditems", "microsoft.recoveryservices/vaults/backuppolicies")
| where properties.backupManagementType == "AzureIaasVM"
| extend backupItemid = id
| extend isBackedUp = isnotempty(id)
| extend policyName = case(type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.policyName, type =~ 'microsoft.dataprotection/backupVaults/backupInstances', properties.policyInfo.name, '--')
| extend vaultName = case(type =~ 'microsoft.dataprotection/backupVaults/backupInstances',split(split(id, '/Microsoft.DataProtection/backupVaults/')[1],'/')[0],type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',split(split(id, '/Microsoft.RecoveryServices/vaults/')[1],'/')[0],'--')
| extend dataSourceType = case(type=~'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.backupManagementType,type =~ 'microsoft.dataprotection/backupVaults/backupInstances',properties.dataSourceSetInfo.datasourceType,'--')
| project resourceId = tolower(tostring(properties.sourceResourceId)), backupItemid, isBackedUp, policyName, vaultName, dataSourceType ) on resourceId
| extend isProtected = isnotempty(backupItemid)
I tried to combine them, but I'm getting the following error.
Combined Query
Resources
| where type in~ ('microsoft.compute/virtualmachines','microsoft.classiccompute/virtualmachines')
| extend resourceId=tolower(id)
| extend sku = properties.storageProfile.imageReference.sku
| extend publisher = properties.storageProfile.imageReference.publisher
| extend offer = properties.storageProfile.imageReference.offer
| extend ostype = properties.storageProfile.osDisk.osType
| extend hardwareType = properties.hardwareProfile
| join kind = leftouter ( RecoveryServicesResources
| where type in~ ('Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems')
| extend vaultName = case(type =~ 'microsoft.dataprotection/backupVaults/backupInstances',split(split(id, '/Microsoft.DataProtection/backupVaults/')[1],'/')[0],type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',split(split(id, '/Microsoft.RecoveryServices/vaults/')[1],'/')[0],'--')
| extend dataSourceType = case(type=~'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.backupManagementType,type =~ 'microsoft.dataprotection/backupVaults/backupInstances',properties.dataSourceSetInfo.datasourceType,'--')
| extend friendlyName = properties.friendlyName
| extend dsResourceGroup = split(split(properties.dataSourceInfo.resourceID, '/resourceGroups/')[1],'/')[0]
| extend dsSubscription = split(split(properties.dataSourceInfo.resourceID, '/subscriptions/')[1],'/')[0]
| extend primaryLocation = properties.dataSourceInfo.resourceLocation
| extend policyName = case(type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.policyName, type =~ 'microsoft.dataprotection/backupVaults/backupInstances', properties.policyInfo.name, '--')
| extend protectionState = properties.currentProtectionState
| extend vmProperties = properties
| where protectionState in~ ('ConfiguringProtection','ProtectionConfigured','ConfiguringProtectionFailed','ProtectionStopped','SoftDeleted','ProtectionError')
| project id, friendlyName, dataSourceType, dsResourceGroup, dsSubscription, vmProperties, vaultName, protectionState, policyName,primaryLocation
| join kind=leftouter (RecoveryServicesResources
| where type == 'microsoft.recoveryservices/vaults/backuppolicies'
| extend xvaultName = case(type == 'microsoft.recoveryservices/vaults/backuppolicies', split(split(id, 'microsoft.recoveryservices/vaults/')[1],'/')[0],type == 'microsoft.recoveryservices/vaults/backuppolicies', split(split(id, 'microsoft.recoveryservices/vaults/')[1],'/')[0],'--')
| extend datasourceType = case(type == 'microsoft.recoveryservices/vaults/backuppolicies', properties.backupManagementType,type == 'microsoft.dataprotection/backupVaults/backupPolicies',properties.datasourceTypes[0],'--')
| extend policyID = id
| extend dailyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.dailySchedule.retentionDuration,',')[0])))))
| extend daylyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.dailySchedule.retentionDuration,',')[1])))))
| extend DailyBackup = strcat("Daily, retention Duration ", daylyLTR, " ", dailyDurationType)
| extend weeklyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.weeklySchedule.retentionDuration,',')[0])))))
| extend weeklyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.weeklySchedule.retentionDuration,',')[1])))))
| extend weeklyStartDate = split(tostring(properties.retentionPolicy.weeklySchedule.daysOfTheWeek),'"')[1]
| extend WeeklyBackup = strcat("Every ", weeklyStartDate, ", retention Duration ", weeklyLTR, " ", weeklyDurationType)
| extend monthlyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.monthlySchedule.retentionDuration,',')[0])))))
| extend monthlyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.monthlySchedule.retentionDuration,',')[1])))))
| extend monthlyStartDayWeek = split(tostring(properties.retentionPolicy.monthlySchedule.retentionScheduleWeekly.daysOfTheWeek),'"')[1]
| extend monthlyStartWeekMonth = split(tostring(properties.retentionPolicy.monthlySchedule.retentionScheduleWeekly.weeksOfTheMonth),'"')[1]
| extend MonthlyBackup = strcat("Every ", monthlyStartDayWeek, " ", monthlyStartWeekMonth, " Week, retention Duration ", monthlyLTR, " " , monthlyDurationType)
| extend yearDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.yearlySchedule.retentionDuration,',')[0])))))
| extend yearLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.yearlySchedule.retentionDuration,',')[1])))))
| extend yearlyStartDayWeek = split(tostring(properties.retentionPolicy.yearlySchedule.retentionScheduleWeekly.daysOfTheWeek),'"')[1]
| extend yearlyStartWeekMonth = split(tostring(properties.retentionPolicy.yearlySchedule.retentionScheduleWeekly.weeksOfTheMonth),'"')[1]
| extend yearlyStartMonth = split(tostring(properties.retentionPolicy.yearlySchedule.monthsOfYear),'"')[1]
| extend YearlyBackup = strcat("Every month ", yearlyStartWeekMonth, " ", yearlyStartDayWeek, ", retention Duration ", yearLTR, " ", yearDurationType)
| project resourceId = tolower(tostring(properties.sourceResourceId)),policyID,policyName=name,DailyBackup,WeeklyBackup,MonthlyBackup,YearlyBackup,daylyLTR,weeklyLTR,monthlyLTR,yearLTR) on policyName) on resourceId
Error
Please provide below info when asking for support: timestamp = 2022-01-25T05:23:02.8691470Z, correlationId = e0cf5a44-aacb-4325-b6b1-edbdb44c2ac0. (Code:BadRequest)
Details:
Table RecoveryServicesResources was referenced as right table 2 times, which exceeded the limit of 1. Please see https://aka.ms/resourcegraph-tables for help. (Code:DisallowedMaxNumberOfRemoteTables)
I would like to use Spark to parse network messages and group them into logical entities in a stateful manner.
Problem Description
Let's assume each message is in one row of an input dataframe, depicted below.
| row | time | raw payload |
+-------+------+---------------+
| 1 | 10 | TEXT1; |
| 2 | 20 | TEXT2;TEXT3; |
| 3 | 30 | LONG- |
| 4 | 40 | TEXT1; |
| 5 | 50 | TEXT4;TEXT5;L |
| 6 | 60 | ONG |
| 7 | 70 | -TEX |
| 8 | 80 | T2; |
The task is to parse the logical messages out of the raw payload and provide them in a new output dataframe. In the example, each logical message in the payload ends with a semicolon (the delimiter).
The desired output dataframe could then look as follows:
| row | time | message |
+-------+------+---------------+
| 1 | 10 | TEXT1; |
| 2 | 20 | TEXT2; |
| 3 | 20 | TEXT3; |
| 4 | 30 | LONG-TEXT1; |
| 5 | 50 | TEXT4; |
| 6 | 50 | TEXT5; |
| 7 | 50 | LONG-TEXT2; |
Note that some input rows do not yield a new row in the result (e.g. rows 4, 6, 7, 8), and some even yield multiple rows (e.g. rows 2 and 5).
My questions:
Is this a use case for a UDAF? If so, how, for example, should I implement the merge function? I have no idea what its purpose is.
Since the message ordering matters (I cannot process LONG-TEXT1 and LONG-TEXT2 properly without respecting the message order), can I tell Spark to parallelize on a higher level (e.g. per calendar day of messages) but not parallelize within a day (e.g. events at times 50, 60, 70, 80 need to be processed in order)?
Follow-up question: is it conceivable that the solution will be usable not just in traditional Spark, but also in Spark Structured Streaming? Or does the latter require its own kind of stateful processing method?
Generally, you can run arbitrary stateful aggregations on Spark Streaming by using mapGroupsWithState or flatMapGroupsWithState; the Structured Streaming programming guide has some examples. None of those, though, will guarantee that the processing of the stream is ordered by event time.
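To make that concrete, here is a minimal sketch of what flatMapGroupsWithState could look like for this parsing task. The day grouping key and the packets Dataset[Packet] are assumptions for illustration, not part of the question's data; note in particular the caveat in the comment, which is exactly the ordering limitation mentioned above.
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}

case class Packet(day: String, time: Int, payload: String)

// Carry the unfinished fragment across invocations as the group state.
def parseGroup(day: String, packets: Iterator[Packet],
               state: GroupState[String]): Iterator[String] = {
  var rest = state.getOption.getOrElse("")
  val out = scala.collection.mutable.ArrayBuffer[String]()
  // Caveat: within a group, this iterator is NOT guaranteed to be sorted
  // by event time.
  for (p <- packets) {
    val pieces = (rest + p.payload).split(";", -1)
    out ++= pieces.dropRight(1).map(_ + ";") // completed messages
    rest = pieces.last                       // unfinished tail
  }
  state.update(rest)
  out.iterator
}

// packets: Dataset[Packet]; requires import spark.implicits._ for the encoders
val parsed = packets
  .groupByKey(_.day)
  .flatMapGroupsWithState(OutputMode.Append, GroupStateTimeout.NoTimeout)(parseGroup)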
If you need to enforce data ordering, you should try to use window operations on event time. In that case you need to run stateless operations instead, but if the number of elements in each window group is small enough, you can for instance use collect_list and then apply a UDF (where you can manage the state for each window group) on each list.
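A sketch of that idea, under the assumption of an eventTime timestamp column and one-day windows (neither is part of the original data), with the ordering enforced by an explicit sort inside the UDF:
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.{col, collect_list, explode, struct, udf, window}

// Parse one window group's packets in event-time order. The "state" (the
// unfinished fragment) lives only inside the UDF call, so the query itself
// remains a stateless aggregation.
val parseWindowGroup = udf { packets: Seq[Row] =>
  val sorted = packets.sortBy(_.getAs[Int]("time"))
  var rest = ""
  val out = scala.collection.mutable.ArrayBuffer[String]()
  for (p <- sorted) {
    val pieces = (rest + p.getAs[String]("payload")).split(";", -1)
    out ++= pieces.dropRight(1).map(_ + ";") // completed messages
    rest = pieces.last                       // unfinished tail
  }
  out.toSeq
}

val result = df
  .groupBy(window(col("eventTime"), "1 day")) // assumed timestamp column
  .agg(collect_list(struct(col("time"), col("payload"))).as("packets"))
  .select(explode(parseWindowGroup(col("packets"))).as("message"))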
OK, I figured out in the meantime how to do this with a UDAF.
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

// note: UserDefinedAggregateFunction is deprecated since Spark 3.0 in favor of Aggregator
class TagParser extends UserDefinedAggregateFunction {

  // one input column: the raw payload fragment of the current row
  override def inputSchema: StructType = StructType(StructField("value", StringType) :: Nil)

  // aggregation state: the messages completed by the current row, plus the
  // unfinished fragment carried over into the next row
  override def bufferSchema: StructType = StructType(
    StructField("parsed", ArrayType(StringType)) ::
    StructField("rest", StringType)
    :: Nil)

  override def dataType: DataType = ArrayType(StringType)

  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = IndexedSeq[String]()
    buffer(1) = null
  }

  def doParse(str: String, buffer: MutableAggregationBuffer): Unit = {
    // reset the per-row result; the carried-over fragment survives in buffer(1)
    buffer(0) = IndexedSeq[String]()
    val prevRest = buffer(1)
    var idx = -1
    val strToParse = if (prevRest != null) prevRest + str else str
    do {
      val oldIdx = idx
      idx = strToParse.indexOf(';', oldIdx + 1)
      if (idx == -1) {
        // no further delimiter: keep the tail as the unfinished fragment
        buffer(1) = strToParse.substring(oldIdx + 1)
      } else {
        // a complete message ends here; append it to the per-row result
        val newlyParsed = strToParse.substring(oldIdx + 1, idx)
        buffer(0) = buffer(0).asInstanceOf[IndexedSeq[String]] :+ newlyParsed
        buffer(1) = null
      }
    } while (idx != -1)
  }

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (buffer == null) {
      return
    }
    doParse(input.getAs[String](0), buffer)
  }

  // merge is not exercised when the UDAF is evaluated over an ordered row-frame
  // window, where rows are fed sequentially through update; merging two partial
  // buffers would not be meaningful here anyway, since order matters
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = throw new UnsupportedOperationException

  override def evaluate(buffer: Row): Any = buffer(0)
}
Here is a demo app that uses the above UDAF to solve the problem from above:
import org.apache.spark.sql.expressions.Window

case class Packet(time: Int, payload: String)

object TagParserApp extends App {
  val spark, sc = ... // SparkSession/SparkContext setup kept out for brevity
  import spark.implicits._ // needed for .toDF()

  val df = sc.parallelize(List(
    Packet(10, "TEXT1;"),
    Packet(20, "TEXT2;TEXT3;"),
    Packet(30, "LONG-"),
    Packet(40, "TEXT1;"),
    Packet(50, "TEXT4;TEXT5;L"),
    Packet(60, "ONG"),
    Packet(70, "-TEX"),
    Packet(80, "T2;")
  )).toDF()

  val tp = new TagParser

  // one frame from the first row up to the current row, over the whole
  // dataset; no partitionBy, so ordering is preserved but nothing runs
  // in parallel
  val window = Window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
  val df2 = df.withColumn("msg", tp.apply(df.col("payload")).over(window))
  df2.show()
}
This yields:
+----+-------------+--------------+
|time| payload| msg|
+----+-------------+--------------+
| 10| TEXT1;| [TEXT1]|
| 20| TEXT2;TEXT3;|[TEXT2, TEXT3]|
| 30| LONG-| []|
| 40| TEXT1;| [LONG-TEXT1]|
| 50|TEXT4;TEXT5;L|[TEXT4, TEXT5]|
| 60| ONG| []|
| 70| -TEX| []|
| 80| T2;| [LONG-TEXT2]|
+----+-------------+--------------+
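To get from these per-row arrays to the one-row-per-message shape of the desired output dataframe, the msg column can simply be exploded; explode drops the rows whose arrays are empty. A small follow-up sketch (note that the demo attributes a merged message to the row that completes it, e.g. time 40 for LONG-TEXT1, whereas the desired table above used the starting time 30):
import org.apache.spark.sql.functions.{col, explode}

// one output row per parsed message; rows with empty msg arrays disappear
val flat = df2.select(col("time"), explode(col("msg")).as("message"))
flat.show()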
The main issue for me was to figure out how to actually apply this UDAF, namely using this:
df.withColumn("msg", tp.apply(df.col("payload")).over(window))
The only thing I still need to figure out are the aspects of parallelization (which I only want to happen where we do not rely on ordering), but that's a separate issue for me.
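One conceivable direction for that, purely as a sketch: given a day column to group by (assumed here, not present in the demo data), partitioning the window would let Spark process days in parallel while the UDAF still sees the rows of each day in strict time order.
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.col

// hypothetical "day" column; partitions are processed in parallel, rows
// within a partition are fed to the UDAF in ascending time order
val byDay = Window
  .partitionBy(col("day"))
  .orderBy(col("time"))
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

val df3 = df.withColumn("msg", tp.apply(col("payload")).over(byDay))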