Azure Resource Graph query to get VM backup data
I'm writing an Azure Resource Graph query to extract all VMs, including the backup policy associated with each VM (if any); when a policy is associated, I also need to extract the retention details of the backup. But I'm getting this error:
Table RecoveryServicesResources was referenced as right table 2 times, which exceeded the limit of 1. Please see https://aka.ms/resourcegraph-tables for help. (Code:DisallowedMaxNumberOfRemoteTables)
Graph Query 1 (to get all backup policies and their associated devices):
RecoveryServicesResources
| where type in~ ('Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems')
| where properties.backupManagementType == "AzureIaasVM"
| extend vaultName = case(type =~ 'microsoft.dataprotection/backupVaults/backupInstances',split(split(id, '/Microsoft.DataProtection/backupVaults/')[1],'/')[0],type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',split(split(id, '/Microsoft.RecoveryServices/vaults/')[1],'/')[0],'--')
| extend dataSourceType = case(type=~'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.backupManagementType,type =~ 'microsoft.dataprotection/backupVaults/backupInstances',properties.dataSourceSetInfo.datasourceType,'--')
| extend friendlyName = properties.friendlyName
| extend dsResourceGroup = split(split(properties.dataSourceInfo.resourceID, '/resourceGroups/')[1],'/')[0]
| extend dsSubscription = split(split(properties.dataSourceInfo.resourceID, '/subscriptions/')[1],'/')[0]
| extend primaryLocation = properties.dataSourceInfo.resourceLocation
| extend policyName = case(type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.policyName, type =~ 'microsoft.dataprotection/backupVaults/backupInstances', properties.policyInfo.name, '--')
| extend protectionState = properties.currentProtectionState
| extend vmProperties = properties
| where protectionState in~ ('ConfiguringProtection','ProtectionConfigured','ConfiguringProtectionFailed','ProtectionStopped','SoftDeleted','ProtectionError')
| project id, friendlyName, dataSourceType, dsResourceGroup, dsSubscription, vmProperties, vaultName, protectionState, policyName,primaryLocation
| join kind = leftouter (
RecoveryServicesResources
| where type == 'microsoft.recoveryservices/vaults/backuppolicies'
| extend xvaultName = case(type == 'microsoft.recoveryservices/vaults/backuppolicies', split(split(id, 'microsoft.recoveryservices/vaults/')[1],'/')[0], type == 'microsoft.dataprotection/backupVaults/backupPolicies', split(split(id, '/Microsoft.DataProtection/backupVaults/')[1],'/')[0], '--')
| extend datasourceType = case(type == 'microsoft.recoveryservices/vaults/backuppolicies', properties.backupManagementType,type == 'microsoft.dataprotection/backupVaults/backupPolicies',properties.datasourceTypes[0],'--')
| extend policyID = id
| extend dailyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.dailySchedule.retentionDuration,',')[0])))))
| extend daylyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.dailySchedule.retentionDuration,',')[1])))))
| extend DailyBackup = strcat("Daily, retention Duration ", daylyLTR, " ", dailyDurationType)
| extend weeklyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.weeklySchedule.retentionDuration,',')[0])))))
| extend weeklyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.weeklySchedule.retentionDuration,',')[1])))))
| extend weeklyStartDate = split(tostring(properties.retentionPolicy.weeklySchedule.daysOfTheWeek),'"')[1]
| extend WeeklyBackup = strcat("Every ", weeklyStartDate, ", retention Duration ", weeklyLTR, " ", weeklyDurationType)
| extend monthlyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.monthlySchedule.retentionDuration,',')[0])))))
| extend monthlyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.monthlySchedule.retentionDuration,',')[1])))))
| extend monthlyStartDayWeek = split(tostring(properties.retentionPolicy.monthlySchedule.retentionScheduleWeekly.daysOfTheWeek),'"')[1]
| extend monthlyStartWeekMonth = split(tostring(properties.retentionPolicy.monthlySchedule.retentionScheduleWeekly.weeksOfTheMonth),'"')[1]
| extend MonthlyBackup = strcat("Every ", monthlyStartDayWeek, " ", monthlyStartWeekMonth, " Week, retention Duration ", monthlyLTR, " " , monthlyDurationType)
| extend yearDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.yearlySchedule.retentionDuration,',')[0])))))
| extend yearLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.yearlySchedule.retentionDuration,',')[1])))))
| extend yearlyStartDayWeek = split(tostring(properties.retentionPolicy.yearlySchedule.retentionScheduleWeekly.daysOfTheWeek),'"')[1]
| extend yearlyStartWeekMonth = split(tostring(properties.retentionPolicy.yearlySchedule.retentionScheduleWeekly.weeksOfTheMonth),'"')[1]
| extend yearlyStartMonth = split(tostring(properties.retentionPolicy.yearlySchedule.monthsOfYear),'"')[1]
| extend YearlyBackup = strcat("Every month ", yearlyStartWeekMonth, " ", yearlyStartDayWeek, ", retention Duration ", yearLTR, " ", yearDurationType)
| project resourceId = tolower(tostring(properties.sourceResourceId)),policyID,policyName=name,DailyBackup,WeeklyBackup,MonthlyBackup,YearlyBackup,daylyLTR,weeklyLTR,monthlyLTR,yearLTR) on policyName
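(Aside: the nested replace() calls that strip braces and quotes out of retentionDuration shouldn't be necessary. retentionDuration is a dynamic property, so — assuming it has the usual {"count": N, "durationType": "Days"} shape — its fields can be read directly, for example:

| extend daylyLTR = tostring(properties.retentionPolicy.dailySchedule.retentionDuration['count'])
| extend dailyDurationType = tostring(properties.retentionPolicy.dailySchedule.retentionDuration.durationType)

and likewise for the weekly, monthly and yearly schedules.)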
Graph Query 2 (to get all VMs and whether they have a backup policy):
Resources
| where type in~ ('microsoft.compute/virtualmachines','microsoft.classiccompute/virtualmachines')
| extend resourceId=tolower(id)
| extend sku = properties.storageProfile.imageReference.sku
| extend publisher = properties.storageProfile.imageReference.publisher
| extend offer = properties.storageProfile.imageReference.offer
| extend ostype = properties.storageProfile.osDisk.osType
| extend hardwareType = properties.hardwareProfile
| join kind = leftouter (
RecoveryServicesResources
| where type in~ ("microsoft.recoveryservices/vaults/backupfabrics/protectioncontainers/protecteditems", "microsoft.recoveryservices/vaults/backuppolicies")
| where properties.backupManagementType == "AzureIaasVM"
| extend backupItemid = id
| extend isBackedUp = isnotempty(id)
| extend policyName = case(type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.policyName, type =~ 'microsoft.dataprotection/backupVaults/backupInstances', properties.policyInfo.name, '--')
| extend vaultName = case(type =~ 'microsoft.dataprotection/backupVaults/backupInstances',split(split(id, '/Microsoft.DataProtection/backupVaults/')[1],'/')[0],type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',split(split(id, '/Microsoft.RecoveryServices/vaults/')[1],'/')[0],'--')
| extend dataSourceType = case(type=~'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.backupManagementType,type =~ 'microsoft.dataprotection/backupVaults/backupInstances',properties.dataSourceSetInfo.datasourceType,'--')
| project resourceId = tolower(tostring(properties.sourceResourceId)), backupItemid, isBackedUp, policyName, vaultName, dataSourceType ) on resourceId
| extend isProtected = isnotempty(backupItemid)
I tried to combine them, but I'm getting the following error.
Combined Query
Resources
| where type in~ ('microsoft.compute/virtualmachines','microsoft.classiccompute/virtualmachines')
| extend resourceId=tolower(id)
| extend sku = properties.storageProfile.imageReference.sku
| extend publisher = properties.storageProfile.imageReference.publisher
| extend offer = properties.storageProfile.imageReference.offer
| extend ostype = properties.storageProfile.osDisk.osType
| extend hardwareType = properties.hardwareProfile
| join kind = leftouter ( RecoveryServicesResources
| where type in~ ('Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems')
| extend vaultName = case(type =~ 'microsoft.dataprotection/backupVaults/backupInstances',split(split(id, '/Microsoft.DataProtection/backupVaults/')[1],'/')[0],type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',split(split(id, '/Microsoft.RecoveryServices/vaults/')[1],'/')[0],'--')
| extend dataSourceType = case(type=~'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.backupManagementType,type =~ 'microsoft.dataprotection/backupVaults/backupInstances',properties.dataSourceSetInfo.datasourceType,'--')
| extend friendlyName = properties.friendlyName
| extend dsResourceGroup = split(split(properties.dataSourceInfo.resourceID, '/resourceGroups/')[1],'/')[0]
| extend dsSubscription = split(split(properties.dataSourceInfo.resourceID, '/subscriptions/')[1],'/')[0]
| extend primaryLocation = properties.dataSourceInfo.resourceLocation
| extend policyName = case(type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems',properties.policyName, type =~ 'microsoft.dataprotection/backupVaults/backupInstances', properties.policyInfo.name, '--')
| extend protectionState = properties.currentProtectionState
| extend vmProperties = properties
| where protectionState in~ ('ConfiguringProtection','ProtectionConfigured','ConfiguringProtectionFailed','ProtectionStopped','SoftDeleted','ProtectionError')
| project id, friendlyName, dataSourceType, dsResourceGroup, dsSubscription, vmProperties, vaultName, protectionState, policyName,primaryLocation
| join kind=leftouter (RecoveryServicesResources
| where type == 'microsoft.recoveryservices/vaults/backuppolicies'
| extend xvaultName = case(type == 'microsoft.recoveryservices/vaults/backuppolicies', split(split(id, 'microsoft.recoveryservices/vaults/')[1],'/')[0], type == 'microsoft.dataprotection/backupVaults/backupPolicies', split(split(id, '/Microsoft.DataProtection/backupVaults/')[1],'/')[0], '--')
| extend datasourceType = case(type == 'microsoft.recoveryservices/vaults/backuppolicies', properties.backupManagementType,type == 'microsoft.dataprotection/backupVaults/backupPolicies',properties.datasourceTypes[0],'--')
| extend policyID = id
| extend dailyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.dailySchedule.retentionDuration,',')[0])))))
| extend daylyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.dailySchedule.retentionDuration,',')[1])))))
| extend DailyBackup = strcat("Daily, retention Duration ", daylyLTR, " ", dailyDurationType)
| extend weeklyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.weeklySchedule.retentionDuration,',')[0])))))
| extend weeklyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.weeklySchedule.retentionDuration,',')[1])))))
| extend weeklyStartDate = split(tostring(properties.retentionPolicy.weeklySchedule.daysOfTheWeek),'"')[1]
| extend WeeklyBackup = strcat("Every ", weeklyStartDate, ", retention Duration ", weeklyLTR, " ", weeklyDurationType)
| extend monthlyDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.monthlySchedule.retentionDuration,',')[0])))))
| extend monthlyLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.monthlySchedule.retentionDuration,',')[1])))))
| extend monthlyStartDayWeek = split(tostring(properties.retentionPolicy.monthlySchedule.retentionScheduleWeekly.daysOfTheWeek),'"')[1]
| extend monthlyStartWeekMonth = split(tostring(properties.retentionPolicy.monthlySchedule.retentionScheduleWeekly.weeksOfTheMonth),'"')[1]
| extend MonthlyBackup = strcat("Every ", monthlyStartDayWeek, " ", monthlyStartWeekMonth, " Week, retention Duration ", monthlyLTR, " " , monthlyDurationType)
| extend yearDurationType = replace('"','',replace(':','',replace('durationType','',replace('{','',tostring(split(properties.retentionPolicy.yearlySchedule.retentionDuration,',')[0])))))
| extend yearLTR = replace('"','',replace(':','',replace('count','',replace('}','',tostring(split(properties.retentionPolicy.yearlySchedule.retentionDuration,',')[1])))))
| extend yearlyStartDayWeek = split(tostring(properties.retentionPolicy.yearlySchedule.retentionScheduleWeekly.daysOfTheWeek),'"')[1]
| extend yearlyStartWeekMonth = split(tostring(properties.retentionPolicy.yearlySchedule.retentionScheduleWeekly.weeksOfTheMonth),'"')[1]
| extend yearlyStartMonth = split(tostring(properties.retentionPolicy.yearlySchedule.monthsOfYear),'"')[1]
| extend YearlyBackup = strcat("Every month ", yearlyStartWeekMonth, " ", yearlyStartDayWeek, ", retention Duration ", yearLTR, " ", yearDurationType)
| project resourceId = tolower(tostring(properties.sourceResourceId)),policyID,policyName=name,DailyBackup,WeeklyBackup,MonthlyBackup,YearlyBackup,daylyLTR,weeklyLTR,monthlyLTR,yearLTR) on policyName) on resourceId
Error
Please provide below info when asking for support: timestamp = 2022-01-25T05:23:02.8691470Z, correlationId = e0cf5a44-aacb-4325-b6b1-edbdb44c2ac0. (Code:BadRequest)
Details:
Table RecoveryServicesResources was referenced as right table 2 times, which exceeded the limit of 1. Please see https://aka.ms/resourcegraph-tables for help. (Code:DisallowedMaxNumberOfRemoteTables)
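For what it's worth, the usual way around this limit is to restructure the joins (the following is only a sketch, not verified against a real environment). Resource Graph counts only the tables referenced on the right side of a join, and the outermost (leftmost) table is free. So if the protected-items query is made the outer table, RecoveryServicesResources appears only once on the right (for the policies) and Resources once (for the VM details), which stays within the limit:

RecoveryServicesResources
| where type =~ 'Microsoft.RecoveryServices/vaults/backupFabrics/protectionContainers/protectedItems'
| where properties.backupManagementType == "AzureIaasVM"
| extend policyName = tostring(properties.policyName)
| extend resourceId = tolower(tostring(properties.sourceResourceId))
| join kind=leftouter (
    // the single allowed right-side reference of RecoveryServicesResources
    RecoveryServicesResources
    | where type == 'microsoft.recoveryservices/vaults/backuppolicies'
    | project policyName = name, retentionPolicy = properties.retentionPolicy
) on policyName
| join kind=leftouter (
    // Resources is a different table, so this second right-side join is allowed
    Resources
    | where type in~ ('microsoft.compute/virtualmachines','microsoft.classiccompute/virtualmachines')
    | project resourceId = tolower(id), vmName = name, ostype = tostring(properties.storageProfile.osDisk.osType)
) on resourceId
| project resourceId, vmName, ostype, policyName, retentionPolicy

Two caveats: this returns only VMs that have a protected item, so VMs with no backup would still have to come from the second query (or a client-side merge of the two result sets), and joining on policyName alone can mismatch when two vaults reuse the same policy name, so joining on the vault name as well would be safer.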