How do I use the DAX function ParallelPeriod - excel

The PARALLELPERIOD function allows for the comparison of values between points in time (e.g., how do sales compare to a year ago?). I'm doing something wrong in my use of it, but I have no idea what that something may be.
Set up
I created a bog-simple PowerPivot source query against SQL Server 2008+ and named it Source. The query generates 168 rows: 6 IDs (100-600) and 28 dates (the first of each month from Jan 2010 to Apr 2012), all cross applied together.
; WITH SRC (groupKey, eventDate, value) AS
(
SELECT G.groupKey, D.eventDate, CAST(rand(G.groupKey * year(D.eventDate) * month(D.eventDate)) * 100 AS int)
FROM
(
SELECT 100
UNION ALL SELECT 200
UNION ALL SELECT 300
UNION ALL SELECT 400
UNION ALL SELECT 500
UNION ALL SELECT 600
) G (groupKey)
CROSS APPLY
(
SELECT CAST('2010-01-01' AS date)
UNION ALL SELECT CAST('2010-02-01' AS date)
UNION ALL SELECT CAST('2010-03-01' AS date)
UNION ALL SELECT CAST('2010-04-01' AS date)
UNION ALL SELECT CAST('2010-05-01' AS date)
UNION ALL SELECT CAST('2010-06-01' AS date)
UNION ALL SELECT CAST('2010-07-01' AS date)
UNION ALL SELECT CAST('2010-08-01' AS date)
UNION ALL SELECT CAST('2010-09-01' AS date)
UNION ALL SELECT CAST('2010-10-01' AS date)
UNION ALL SELECT CAST('2010-11-01' AS date)
UNION ALL SELECT CAST('2010-12-01' AS date)
UNION ALL SELECT CAST('2011-01-01' AS date)
UNION ALL SELECT CAST('2011-02-01' AS date)
UNION ALL SELECT CAST('2011-03-01' AS date)
UNION ALL SELECT CAST('2011-04-01' AS date)
UNION ALL SELECT CAST('2011-05-01' AS date)
UNION ALL SELECT CAST('2011-06-01' AS date)
UNION ALL SELECT CAST('2011-07-01' AS date)
UNION ALL SELECT CAST('2011-08-01' AS date)
UNION ALL SELECT CAST('2011-09-01' AS date)
UNION ALL SELECT CAST('2011-10-01' AS date)
UNION ALL SELECT CAST('2011-11-01' AS date)
UNION ALL SELECT CAST('2011-12-01' AS date)
UNION ALL SELECT CAST('2012-01-01' AS date)
UNION ALL SELECT CAST('2012-02-01' AS date)
UNION ALL SELECT CAST('2012-03-01' AS date)
UNION ALL SELECT CAST('2012-04-01' AS date)
) D (eventDate)
)
SELECT
*
FROM
SRC;
I added a derived column in PowerPivot using a formula I lifted from MSDN
=CALCULATE(SUM(Source[value]), PARALLELPERIOD(Source[eventDate], -1, year))
There are no errors displayed but there's never any calculated data. I've tried different intervals (-1, +1) and periods (year, month) but to no avail.
The only difference I could observe between my demo and the MSDN example was that theirs had a separate dimension defined for the date. Easy enough to rectify, so I created a Dates query with the following. The query generates a row for every day of 2010 through 2012 (1096 rows).
DECLARE
@start int = 20100101
, @stop int = 20120601;
WITH L0 AS
(
SELECT
0 AS C
UNION ALL
SELECT
0
)
, L1 AS
(
SELECT
0 AS c
FROM
L0 AS A
CROSS JOIN L0 AS B
)
, L2 AS
(
SELECT
0 AS c
FROM
L1 AS A
CROSS JOIN L1 AS B
)
, L3 AS
(
SELECT
0 AS c
FROM
L2 AS A
CROSS JOIN L2 AS B
)
, L4 AS
(
SELECT
0 AS c
FROM
L3 AS A
CROSS JOIN L3 AS B
)
, L5 AS
(
SELECT
0 AS c
FROM
L4 AS A
CROSS JOIN L4 AS B
)
, NUMS AS
(
SELECT
ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS number
FROM
L5
)
, YEARS AS
(
SELECT
Y.number
FROM
NUMS Y
WHERE
Y.number BETWEEN @start / 10000 AND @stop / 10000
)
, MONTHS AS
(
SELECT
Y.number
FROM
NUMS Y
WHERE
Y.number BETWEEN 1 and 12
)
, DAYS AS
(
SELECT
Y.number
FROM
NUMS Y
WHERE
Y.number BETWEEN 1 and 31
)
, CANDIDATES_0 AS
(
SELECT
Y.number * 10000 + M.number * 100 + D.number AS SurrogateKey
, CAST(Y.number * 10000 + M.number * 100 + D.number AS char(8)) AS DateValue
FROM
YEARS Y
CROSS APPLY
MONTHS M
CROSS APPLY
DAYS D
)
, HC AS
(
SELECT
Y.number * 10000 + M.number * 100 + D.number AS SurrogateKey
, CAST(Y.number * 10000 + M.number * 100 + D.number AS char(8)) AS DateValue
FROM
YEARS Y
CROSS APPLY
MONTHS M
CROSS APPLY
DAYS D
WHERE
D.number < 31
AND M.number IN (4,6,9,11)
UNION ALL
SELECT
Y.number * 10000 + M.number * 100 + D.number AS SurrogateKey
, CAST(Y.number * 10000 + M.number * 100 + D.number AS char(8)) AS DateValue
FROM
YEARS Y
CROSS APPLY
MONTHS M
CROSS APPLY
DAYS D
WHERE
D.number < 32
AND M.number IN (1,3,5,7,8,10,12)
UNION ALL
SELECT
Y.number * 10000 + M.number * 100 + D.number AS SurrogateKey
, CAST(Y.number * 10000 + M.number * 100 + D.number AS char(8)) AS DateValue
FROM
YEARS Y
CROSS APPLY
MONTHS M
CROSS APPLY
DAYS D
WHERE
D.number < 29
AND M.number = 2
AND
(
Y.number % 4 > 0
OR Y.number % 100 = 0 AND Y.number % 400 > 0
)
UNION ALL
SELECT
Y.number * 10000 + M.number * 100 + D.number AS SurrogateKey
, CAST(Y.number * 10000 + M.number * 100 + D.number AS char(8)) AS DateValue
FROM
YEARS Y
CROSS APPLY
MONTHS M
CROSS APPLY
DAYS D
WHERE
D.number < 30
AND M.number = 2
AND
(
Y.number % 4 = 0
OR Y.number % 100 = 0 AND Y.number % 400 = 0
)
)
, CANDIDATES AS
(
SELECT
C.SurrogateKey
, CAST(C.DateValue as date) As DateValue
FROM
HC C
WHERE
ISDATE(c.DateValue) = 1
)
, PARTS
(
DateKey
, FullDateAlternateKey
, DayNumberOfWeek
, EnglishDayNameOfWeek
, DayNumberOfMonth
, DayNumberOfYear
, WeekNumberOfYear
, EnglishMonthName
, MonthNumberOfYear
, CalendarQuarter
, CalendarYear
, CalendarSemester
--,FiscalQuarter
--,FiscalYear
--,FiscalSemester
) AS
(
SELECT
CAST(C.SurrogateKey AS int)
, C.DateValue
, DATEPART(WEEKDAY, C.DateValue)
, DATENAME(WEEKDAY, C.DateValue)
, DATEPART(DAY, C.DateValue)
, DATEPART(DAYOFYEAR, C.DateValue)
, DATEPART(WEEK, C.DateValue)
, DATENAME(MONTH, C.DateValue)
, DATEPART(MONTH, C.DateValue)
, DATEPART(QUARTER, C.DateValue)
, DATEPART(YEAR, C.DateValue)
, DATEPART(WEEK, C.DateValue)
FROM
CANDIDATES C
WHERE
C.DateValue IS NOT NULL
)
SELECT
P.*
FROM
--HC P
PARTS P
ORDER BY 1;
With the data generated, I created a relationship between Source and Dates and tried this formula, again with no luck:
=CALCULATE(SUM(Source[value]), PARALLELPERIOD(Dates[FullDateAlternateKey], -1, year))
The PowerPivot designer looks like this (screenshot omitted).
Any thoughts on what I'm doing wrong?
References
PARALLELPERIOD Function
PowerPivot DAX PARALLELPERIOD vs DATEADD

The DAX expression you used in the derived column should instead be a measure, defined in the calculation area...
MeasurePriorPeriodValue := CALCULATE(SUM(Source[value]), PARALLELPERIOD(Source[eventDate], -1, year))
...a calculated column is evaluated row by row when the table is processed, which is not the filter context the time-intelligence functions are designed for, whereas a measure is evaluated in the pivot table's filter context. As long as the column you pass to PARALLELPERIOD is configured as a Date data type, it should work. Having the date table separated from the rest is a "best practice" but not required: a dedicated date table lets you ensure there are no gaps in the dates (gaps can cause problems with some DAX time-intelligence functions), and things like that.

Related

Pyspark Window function: Counting number of categorical variables and calculating percentages

I have a dataframe in the below format. There are different IDs, and product names and types associated with each ID.
ID Prod Name Type Total Qty
1 ABC A 200
1 DEF B 350
1 GEH B 120
1 JIK C 100
1 LMO A 40
2 ABC A 10
2 DEF A 20
2 GEH C 30
2 JIK C 40
2 LMO A 50
So I am trying to get the percentage of total A's, B's and C's for that product name and ID in a separate column. As a first step, I was trying to use a window function, but it gave me the count of "A" across the whole column.
df.withColumn("count_cat", F.count("Type").over(Window.partitionBy("Type")))
But I need something like this
ID total Products Total Qty % of A % of B % of C
1 5 810 0.29 0.58 0.12
Approach 1: Group By Aggregation
Based on your expected output, aggregates over a GROUP BY ID would be sufficient.
You may achieve this as follows, assuming your initial dataset is stored in a dataframe input_df.
Using Spark SQL
Ensure your dataframe is accessible by creating a temporary view:
input_df.createOrReplaceTempView("input_df")
Then run the SQL below on your Spark session:
output_df = sparkSession.sql("""
SELECT
ID,
COUNT(Prod_Name) as `total products`,
SUM(Total_Qty) as `Total Qty`,
SUM(
CASE WHEN Type='A' THEN Total_Qty END
) / SUM(Total_Qty) as `% of A`,
SUM(
CASE WHEN Type='B' THEN Total_Qty END
) / SUM(Total_Qty) as `% of B`,
SUM(
CASE WHEN Type='C' THEN Total_Qty END
) / SUM(Total_Qty) as `% of C`
FROM
input_df
GROUP BY
ID
""").na.fill(0)
Using the pyspark API
from pyspark.sql import functions as F
output_df = (
input_df.groupBy("ID")
.agg(
F.count("Prod_Name").alias("total products"),
F.sum("Total_Qty").alias("Total Qty"),
(F.sum(
F.when(
F.col("Type")=="A",F.col("Total_Qty")
).otherwise(0)
) / F.sum("Total_Qty")).alias("% of A"),
(F.sum(
F.when(
F.col("Type")=="B",F.col("Total_Qty")
).otherwise(0)
)/ F.sum("Total_Qty")).alias("% of B"),
(F.sum(
F.when(
F.col("Type")=="C",F.col("Total_Qty")
).otherwise(0)
)/ F.sum("Total_Qty")).alias("% of C")
)
)
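As a quick sanity check (this is not part of the answer's code), showing the aggregated result against the sample data from the question should give one row per ID; the figures in the comments below were computed by hand from the question's sample rows and are rounded:
# rough sanity check against the question's sample data (values rounded)
output_df.orderBy("ID").show()
# ID 1 -> total products = 5, Total Qty = 810, % of A ~ 0.30, % of B ~ 0.58, % of C ~ 0.12
# ID 2 -> total products = 5, Total Qty = 150, % of A ~ 0.53, % of B = 0.00, % of C ~ 0.47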
Approach 2: Using window functions
If you would like to add these as 5 additional columns to your dataset, you may use similar aggregations over a window, OVER (PARTITION BY ID) / Window.partitionBy("ID"), as shown below.
Using Spark SQL
Ensure your dataframe is accessible by creating a temporary view:
input_df.createOrReplaceTempView("input_df")
Then run the SQL below on your Spark session:
output_df = sparkSession.sql("""
SELECT
*,
COUNT(Prod_Name) OVER (PARTITION BY ID) as `total products`,
SUM(Total_Qty) OVER (PARTITION BY ID) as `Total Qty`,
SUM(
CASE WHEN Type='A' THEN Total_Qty END
) OVER (PARTITION BY ID) / SUM(Total_Qty) OVER (PARTITION BY ID) as `% of A`,
SUM(
CASE WHEN Type='B' THEN Total_Qty END
) OVER (PARTITION BY ID)/ SUM(Total_Qty) OVER (PARTITION BY ID) as `% of B`,
SUM(
CASE WHEN Type='C' THEN Total_Qty END
) OVER (PARTITION BY ID) / SUM(Total_Qty) OVER (PARTITION BY ID) as `% of C`
FROM
input_df
""").na.fill(0)
Using the pyspark API
from pyspark.sql import functions as F
from pyspark.sql import Window
agg_window = Window.partitionBy("Id")
output_df = (
input_df.withColumn(
"total products",
F.count("Prod_Name").over(agg_window)
)
.withColumn(
"Total Qty",
F.sum("Total_Qty").over(agg_window)
)
.withColumn(
"% of A",
F.sum(
F.when(
F.col("Type")=="A",F.col("Total_Qty")
).otherwise(0)
).over(agg_window) / F.sum("Total_Qty").over(agg_window)
)
.withColumn(
"% of B",
F.sum(
F.when(
F.col("Type")=="B",F.col("Total_Qty")
).otherwise(0)
).over(agg_window) / F.sum("Total_Qty").over(agg_window)
)
.withColumn(
"% of C",
F.sum(
F.when(
F.col("Type")=="C",F.col("Total_Qty")
).otherwise(0)
).over(agg_window) / F.sum("Total_Qty").over(agg_window)
)
)
Let me know if this works for you.
One approach (without repeating A, B, C, etc.) is to use pivot. The idea is to group first, then pivot the Type:
from pyspark.sql import functions as F
from pyspark.sql import Window as W
(df
.groupBy('ID', 'Type')
.agg(F.sum('Total Qty').alias('qty'))
.withColumn('pct', F.col('qty') / F.sum('qty').over(W.partitionBy('ID')))
.groupBy('ID')
.pivot('Type')
.agg(F.first('pct'))
.show()
)
# Output
# +---+------------------+------------------+-------------------+
# | ID| A| B| C|
# +---+------------------+------------------+-------------------+
# | 1|0.2962962962962963|0.5802469135802469|0.12345679012345678|
# | 2|0.5333333333333333| null| 0.4666666666666667|
# +---+------------------+------------------+-------------------+
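The pivot above yields only the percentage columns. If you also want the total Products and Total Qty columns from the expected output, one possible follow-up (a sketch, not part of the answer above, and assuming the question's column names such as 'Prod Name') is to compute the per-ID totals separately and join them back on ID:
from pyspark.sql import functions as F
from pyspark.sql import Window as W

# per-ID totals (product count and summed quantity), assuming the
# question's column names ('Prod Name', 'Total Qty')
totals = (df
    .groupBy('ID')
    .agg(
        F.count('Prod Name').alias('total Products'),
        F.sum('Total Qty').alias('Total Qty')
    ))

# the pivoted percentages from the snippet above
pct = (df
    .groupBy('ID', 'Type')
    .agg(F.sum('Total Qty').alias('qty'))
    .withColumn('pct', F.col('qty') / F.sum('qty').over(W.partitionBy('ID')))
    .groupBy('ID')
    .pivot('Type')
    .agg(F.first('pct')))

# join the two on ID; fill the missing Type combinations with 0
totals.join(pct, 'ID').na.fill(0).show()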

Pyspark How to create columns and fill True/False if rolling datetime record exists

The data-set contains one record per product per day, but sometimes a day is missed, so I want to create extra columns showing whether a record exists for the past few days.
I have the conditions below:
Create T-1, T-2 and so on columns and fill them as below
Fill T-1 with 1 if the previous day's record exists, otherwise zero
Original Table :
Item Cat DateTime Value
A C1 1-1-2021 10
A C1 2-1-2021 10
A C1 3-1-2021 10
A C1 4-1-2021 10
A C1 5-1-2021 10
A C1 6-1-2021 10
B C1 1-1-2021 20
B C1 4-1-2021 20
Expected Result :
Item Cat DateTime Value T-1 T-2 T-3 T-4 T-5
A C1 1-1-2021 10 0 0 0 0 0
A C1 2-1-2021 10 1 0 0 0 0 (T-1 is 1 as we have 1-1-2021 record)
A C1 3-1-2021 10 1 1 0 0 0
A C1 4-1-2021 10 1 1 1 0 0
A C1 5-1-2021 10 1 1 1 1 0
A C1 6-1-2021 10 1 1 1 1 1
B C1 1-1-2021 20 0 0 0 0 0
B C1 2-1-2021 0 1 0 0 0 0 (the 2-1-2021 record needs to be created with value zero since it is missing from the original data-set; T-1 is 1 because the 1-1-2021 record exists in the original data-set)
B C1 3-1-2021 0 0 1 0 0 0
B C1 4-1-2021 20 0 0 1 0 0
B C1 5-1-2021 0 1 0 0 1 0
Let's assume you have the original table data stored in original_data. We can:
1. Create a temporary view named daily_records to query with Spark SQL.
2. Generate the possible dates. This is done by finding the number of days between the min and max dates in the dataset, then generating those dates with the table-generating function posexplode over a string of spaces.
3. Generate all possible (Item, DateTime) records.
4. Join these records with the actual records to get a complete dataset with values.
5. Use Spark SQL to query the view and create the additional columns using LEFT JOINs and CASE statements.
# Step 1
original_data.createOrReplaceTempView("daily_records")
# Step 2-4
daily_records = sparkSession.sql("""
WITH date_bounds AS (
SELECT min(DateTime) as mindate, max(DateTime) as maxdate FROM daily_records
),
possible_dates AS (
SELECT
date_add(mindate,index.pos) as DateTime
FROM
date_bounds
lateral view posexplode(split(space(datediff(maxdate,mindate)),"")) index
),
unique_items AS (
SELECT DISTINCT Item, Cat from daily_records
),
possible__item_dates AS (
SELECT Item, Cat, DateTime FROM unique_items INNER JOIN possible_dates ON 1=1
),
possible_records AS (
SELECT
p.Item,
p.Cat,
p.DateTime,
r.Value
FROM
possible__item_dates p
LEFT JOIN
daily_records r on p.Item = r.Item and p.DateTime = r.DateTime
)
select * from possible_records
""")
daily_records.createOrReplaceTempView("daily_records")
daily_records.show()
# Step 5 - store results in desired_result
# This is optional, but I have chosen to generate the sql to create this dataframe
periods = 5 # Number of periods to check for
period_columns = ",".join(["""
CASE
WHEN t{0}.Value IS NULL THEN 0
ELSE 1
END as `T-{0}`
""".format(i) for i in range(1,periods+1)])
period_joins = " ".join(["""
LEFT JOIN
daily_records t{0} on datediff(to_date(t.DateTime),to_date(t{0}.DateTime))={0} and t.Item = t{0}.Item
""".format(i) for i in range(1,periods+1)])
period_sql = """
SELECT
t.*
{0}
FROM
daily_records t
{1}
ORDER BY
Item, DateTime
""".format(
"" if len(period_columns)==0 else ",{0}".format(period_columns),
period_joins
)
desired_result= sparkSession.sql(period_sql)
desired_result.show()
Actual SQL generated:
SELECT
t.*,
CASE
WHEN t1.Value IS NULL THEN 0
ELSE 1
END as `T-1`,
CASE
WHEN t2.Value IS NULL THEN 0
ELSE 1
END as `T-2`,
CASE
WHEN t3.Value IS NULL THEN 0
ELSE 1
END as `T-3`,
CASE
WHEN t4.Value IS NULL THEN 0
ELSE 1
END as `T-4`,
CASE
WHEN t5.Value IS NULL THEN 0
ELSE 1
END as `T-5`
FROM
daily_records t
LEFT JOIN
daily_records t1 on datediff(to_date(t.DateTime),to_date(t1.DateTime))=1 and t.Item = t1.Item
LEFT JOIN
daily_records t2 on datediff(to_date(t.DateTime),to_date(t2.DateTime))=2 and t.Item = t2.Item
LEFT JOIN
daily_records t3 on datediff(to_date(t.DateTime),to_date(t3.DateTime))=3 and t.Item = t3.Item
LEFT JOIN
daily_records t4 on datediff(to_date(t.DateTime),to_date(t4.DateTime))=4 and t.Item = t4.Item
LEFT JOIN
daily_records t5 on datediff(to_date(t.DateTime),to_date(t5.DateTime))=5 and t.Item = t5.Item
ORDER BY
Item, DateTime
NB: to_date is optional if DateTime is already a date field or a string in the format yyyy-MM-dd.
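One possible follow-up, not covered by the answer above: the expected result in the question shows 0 rather than NULL for the Value of the filled-in days. A minimal sketch, applied to the desired_result dataframe built above:
# (not part of the generated SQL above) the rows created for missing days keep a
# NULL Value after the LEFT JOIN; the CASE WHEN tN.Value IS NULL tests rely on those
# NULLs, so only replace them with 0 on the final result, as in the expected output
desired_result = desired_result.na.fill({"Value": 0})
desired_result.show()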

Amazon Athena (Presto) SELECT statement to create (n^2 + n)/2 (nth triangular number)

I'm using Athena and trying to find a way to create a select statement that will return a sequence in the below format:
Number
1
2
2
3
3
3
4
4
4
4
And so on, up to 200.
Is it even possible?
Combine sequence() with UNNEST:
SELECT n FROM UNNEST(sequence(1, 5)) t(n)
CROSS JOIN UNNEST(sequence(1, n)) x(y);
presto:default> SELECT n
-> FROM UNNEST(sequence(1, 5)) t(n)
-> CROSS JOIN UNNEST(sequence(1, n)) x(y);
n
---
1
2
2
3
3
3
4
4
4
4
5
5
5
5
5
(15 rows)
(tested in Presto 326, but it will work in Athena too). For the requirement in the question, replace sequence(1, 5) with sequence(1, 200), which returns (200^2 + 200)/2 = 20,100 rows.
Run:
SELECT numbers FROM (
SELECT * FROM (
VALUES flatten(
transform(
sequence(1, 4),
x -> repeat(x, cast(x AS INT))
)
)
) AS x (a) CROSS JOIN UNNEST(a) AS t (numbers)
);
it will return:
numbers
---------
1
2
2
3
3
3
4
4
4
4
(10 rows)

segmenting or grouping a df based on parameters or differences within columns going down the dataframe rows?

I am trying to figure out whether there is a way, given a dataframe with multiple fields, to segment or group the dataframe into a new dataframe based on whether the values of specific columns are within x amount of each other.
I.D | Created_Time | Home_Longitude | Home_Latitude | Work_Longitude | Work_Latitude
Faa1 2019-02-23 20:01:13.362 -77.0364 38.8951 -72.0364 38.8951
Above is how the original df looks; it has many such rows.
I want to create a new dataframe where all rows (I.Ds) have created times within x minutes of each other, home locations within x miles of each other (using haversine), and work locations within x miles of each other.
So basically I am trying to filter this dataframe down to a df that only contains rows that are within x minutes of each other's created time, x miles of each other's home, and x miles of each other's work.
I did this by:
calculating the distances (in miles) and time differences relative to the first row
(my logic: if n rows are within x minutes/miles of the first row, then those n rows are within x minutes/miles of each other)
filtering the data using the required distance and time conditions
Generate some dummy data
random co-ordinates
from random import uniform
import pandas as pd

# Generate random Lat-Long points
def newpoint():
    return uniform(-180, 180), uniform(-90, 90)
home_points = (newpoint() for x in range(289))
work_points = (newpoint() for x in range(289))
df = pd.DataFrame(home_points, columns=['Home_Longitude', 'Home_Latitude'])
df[['Work_Longitude', 'Work_Latitude']] = pd.DataFrame(work_points)
# Insert `ID` column as sequence of integers
df.insert(0, 'ID', range(289))
# Generate random datetimes, separated by 5 minute intervals
# (you can choose your own interval)
times = pd.date_range('2012-10-01', periods=289, freq='5min')
df.insert(1, 'Created_Time', times)
print(df.head())
ID Created_Time Home_Longitude Home_Latitude Work_Longitude Work_Latitude
0 0 2012-10-01 00:00:00 -48.885981 -39.412351 -68.756244 24.739860
1 1 2012-10-01 00:05:00 58.584893 59.851739 -119.978429 -87.687858
2 2 2012-10-01 00:10:00 -18.623484 85.435248 -14.204142 -3.693993
3 3 2012-10-01 00:15:00 -29.721788 71.671103 -69.833253 -12.446204
4 4 2012-10-01 00:20:00 168.257968 -13.247833 60.979050 -18.393925
Create a Python helper function with the haversine distance formula (vectorized, returns km)
import numpy as np

def haversine(lat1, lon1, lat2, lon2, to_radians=False, earth_radius=6371):
    """
    Slightly modified version of: http://stackoverflow.com/a/29546836/2901002
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees or in radians).
    All (lat, lon) coordinates must have numeric dtypes and be of equal length.
    """
    if to_radians:
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    a = np.sin((lat2 - lat1) / 2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0)**2

    return earth_radius * 2 * np.arcsin(np.sqrt(a))
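As a quick way to convince yourself the helper behaves as expected (this check is not part of the original answer, and the coordinates are approximate):
# sanity check: London (51.5074, -0.1278) to Paris (48.8566, 2.3522),
# in decimal degrees, so pass to_radians=True
print(haversine(51.5074, -0.1278, 48.8566, 2.3522, to_radians=True))
# prints roughly 344, i.e. ~344 km, in line with the published great-circle distance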
Calculate the distances (relative to the first row) in km using the haversine formula, then convert km to miles
# coordinates above are in decimal degrees, so convert inside the helper with to_radians=True
df['Home_dist_miles'] = \
    haversine(df.Home_Latitude, df.Home_Longitude,
              df.loc[0, 'Home_Latitude'], df.loc[0, 'Home_Longitude'],
              to_radians=True) * 0.621371
df['Work_dist_miles'] = \
    haversine(df.Work_Latitude, df.Work_Longitude,
              df.loc[0, 'Work_Latitude'], df.loc[0, 'Work_Longitude'],
              to_radians=True) * 0.621371
Calculate time differences, in minutes (relative to first row)
for the dummy data here, the time differences will be in multiples of 5 minutes (but in real data, they could be anything)
df['time'] = df['Created_Time'] - df.loc[0, 'Created_Time']
df['time_min'] = (df['time'].dt.days * 24 * 60 * 60 + df['time'].dt.seconds)/60
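As an aside (an equivalent spelling, not a change to the method), pandas timedelta columns also expose total_seconds(), which avoids the manual days/seconds arithmetic:
# same result as the days/seconds arithmetic above
df['time_min'] = df['time'].dt.total_seconds() / 60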
Apply filters (method 1) and then select any 2 rows that satisfy the conditions stated in the OP
home_filter = df['Home_dist_miles']<=12000 # within 12,000 miles
work_filter = df['Work_dist_miles']<=8000 # within 8,000 miles
time_filter = df['time_min']<=25 # within 25 minutes
df_filtered = df.loc[(home_filter) & (work_filter) & (time_filter)]
# Select any 2 rows that satisfy required conditions
df_any2rows = df_filtered.sample(n=2)
print(df_any2rows)
ID Created_Time Home_Longitude Home_Latitude Work_Longitude Work_Latitude Home_dist_miles Work_dist_miles time time_min
0 0 2012-10-01 00:00:00 -168.956448 -42.970705 -6.340945 -12.749469 0.000000 0.000000 00:00:00 0.0
4 4 2012-10-01 00:20:00 -73.120352 13.748187 -36.953587 23.528789 6259.078588 5939.425019 00:20:00 20.0
Apply filters (method 2) and then select any 2 rows that satisfy the conditions stated in the OP
multi_query = """Home_dist_miles<=12000 & \
Work_dist_miles<=8000 & \
time_min<=25"""
df_filtered = df.query(multi_query)
# Select any 2 rows that satisfy required conditions
df_any2rows = df_filtered.sample(n=2)
print(df_any2rows)
ID Created_Time Home_Longitude Home_Latitude Work_Longitude Work_Latitude Home_dist_miles Work_dist_miles time time_min
0 0 2012-10-01 00:00:00 -168.956448 -42.970705 -6.340945 -12.749469 0.000000 0.000000 00:00:00 0.0
4 4 2012-10-01 00:20:00 -73.120352 13.748187 -36.953587 23.528789 6259.078588 5939.425019 00:20:00 20.0

column values in a row

I have the following table
id count hour age range
-------------------------------------
0 5 10 61 10-200
1 6 20 61 10-200
2 7 15 61 10-200
5 9 5 61 201-300
7 10 25 61 201-300
0 5 10 62 10-20
1 6 20 62 10-20
2 7 15 62 10-20
5 9 5 62 21-30
1 8 6 62 21-30
7 10 25 62 21-30
10 15 30 62 31-40
I need to select the distinct values of the range column.
I tried the following query:
Select distinct range as interval from table_name where age = 62;
Its result is a single column, as follows:
interval
----------
10-20
21-30
31-40
How can I get the result as follows?
10-20, 21-30, 31-40
EDITED:
I am now trying the following query:
select sys_connect_by_path(range,',') interval
from
(select distinct NVL(range,'0') range , ROW_NUMBER() OVER (ORDER BY RANGE) rn
from table_name where age = 62)
where connect_by_isleaf = 1 CONNECT BY rn = PRIOR rn+1 start with rn = 1;
This gives me the following output:
Interval
----------------------------------------------------------------------------
, 10-20,10-20,10-20,21-30,21-30, 31-40
Please help me get my desired output.
If you are on 11.2 rather than just 11.1, you can use the LISTAGG aggregate function
SELECT listagg( interval, ',' )
WITHIN GROUP( ORDER BY interval )
FROM (SELECT DISTINCT range AS interval
FROM table_name
WHERE age = 62)
If you are using an earlier version of Oracle, you could use one of the other Oracle string aggregation techniques on Tim Hall's page. Prior to 11.2, my personal preference would be to create a user-defined aggregate function so that you can then
SELECT string_agg( interval )
FROM (SELECT DISTINCT range AS interval
FROM table_name
WHERE age = 62)
If you don't want to create a function, however, you can use the ROW_NUMBER and SYS_CONNECT_BY_PATH approach, though that tends to be a bit harder to follow:
with x as (
SELECT DISTINCT range AS interval
FROM table_name
WHERE age = 62 )
select ltrim( max( sys_connect_by_path(interval, ','))
keep (dense_rank last order by curr),
',') range
from (select interval,
row_number() over (order by interval) as curr,
row_number() over (order by interval) -1 as prev
from x)
connect by prev = PRIOR curr
start with curr = 1
