Get name / alias of column in PySpark

I am defining a column object like this:
column = F.col('foo').alias('bar')
I know I can get the full expression using str(column).
But how can I get the column's alias only?
In the example, I'm looking for a function get_column_name where get_column_name(column) returns the string bar.

One way is through regular expressions:
from pyspark.sql.functions import col
column = col('foo').alias('bar')
print(column)
#Column<foo AS `bar`>
import re
print(re.findall(r"(?<=AS `)\w+(?=`>$)", str(column))[0])
#bar

Alternatively, we could use a wrapper function to tweak the behavior of the Column.alias and Column.name methods so that they store the alias in an AS attribute:
from pyspark.sql import Column, SparkSession
from pyspark.sql.functions import col, explode, array, struct, lit
SparkSession.builder.getOrCreate()
def alias_wrapper(self, *alias, **kwargs):
    renamed_col = Column._alias(self, *alias, **kwargs)
    renamed_col.AS = alias[0] if len(alias) == 1 else alias
    return renamed_col

# keep the original alias under Column._alias, route alias/name through the wrapper,
# and default AS to None for columns that were never aliased
Column._alias, Column.alias, Column.name, Column.AS = Column.alias, alias_wrapper, alias_wrapper, None
which then guarantees:
assert(col("foo").alias("bar").AS == "bar")
# `name` should act like `alias`
assert(col("foo").name("bar").AS == "bar")
# column without alias should have None in `AS`
assert(col("foo").AS is None)
# multialias should be handled
assert(explode(array(struct(lit(1), lit("a")))).alias("foo", "bar").AS == ("foo", "bar"))

Regex is not needed. In PySpark 3.x it looks like backticks were replaced with quotes, so this might not work out of the box on earlier Spark versions, but it should be easy enough to modify.
from pyspark.sql import Column
def get_column_name(col: Column) -> str:
    """
    PySpark doesn't allow you to directly access the column name with respect to aliases
    from an unbound column. We have to parse this out from the string representation.
    This works on columns with one or more aliases as well as unaliased columns.
    Returns:
        Col name as str, with respect to aliasing
    """
    c = str(col).removeprefix("Column<'").removesuffix("'>")
    return c.split(' AS ')[-1]
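Note that str.removeprefix and str.removesuffix require Python 3.9 or newer; on older interpreters a roughly equivalent version (a sketch using plain slicing) could be:
def get_column_name(col: Column) -> str:
    # same idea for Python < 3.9: strip the "Column<'" prefix and "'>" suffix by slicing
    c = str(col)
    if c.startswith("Column<'") and c.endswith("'>"):
        c = c[len("Column<'"):-len("'>")]
    return c.split(' AS ')[-1]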
Some tests to validate behavior:
import pytest
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

@pytest.fixture(scope="session")
def spark() -> SparkSession:
    # Provide a session-scoped spark fixture for all tests
    yield SparkSession.builder.getOrCreate()

def test_get_col_name(spark):
    col = f.col('a')
    actual = get_column_name(col)
    assert actual == 'a'

def test_get_col_name_alias(spark):
    col = f.col('a').alias('b')
    actual = get_column_name(col)
    assert actual == 'b'

def test_get_col_name_multiple_alias(spark):
    col = f.col('a').alias('b').alias('c')
    actual = get_column_name(col)
    assert actual == 'c'

def test_get_col_name_longer(spark: SparkSession):
    """Added this test due to identifying a bug in the old implementation (if you use lstrip/rstrip, this will fail)."""
    col = f.col("local")
    actual = get_column_name(col)
    assert actual == "local"

I've noticed that in some systems you may have backticks surrounding column names. The following options work both with backticks and without.
Option 1 (no regex): str(col).replace("`", "").split("'")[-2].split(" AS ")[-1]
from pyspark.sql.functions import col
col_1 = col('foo')
col_2 = col('foo').alias('bar')
col_3 = col('foo').alias('bar').alias('baz')
s = str(col_1)
print(col_1)
print(s.replace("`", "").split("'")[-2].split(" AS ")[-1])
# Column<'foo'>
# foo
s = str(col_2)
print(col_2)
print(s.replace("`", "").split("'")[-2].split(" AS ")[-1])
# Column<'foo AS bar'>
# bar
s = str(col_3)
print(col_3)
print(s.replace("`", "").split("'")[-2].split(" AS ")[-1])
# Column<'foo AS bar AS baz'>
# baz
Option 2 (regex): the pattern r"'.*?`?(\w+)`?'" looks safe enough:
re.search(r"'.*?`?(\w+)`?'", str(col)).group(1)
from pyspark.sql.functions import col
col_1 = col('foo')
col_2 = col('foo').alias('bar')
col_3 = col('foo').alias('bar').alias('baz')
import re
print(col_1)
print(re.search(r"'.*?`?(\w+)`?'", str(col_1)).group(1))
# Column<'foo'>
# foo
print(col_2)
print(re.search(r"'.*?`?(\w+)`?'", str(col_2)).group(1))
# Column<'foo AS bar'>
# bar
print(col_3)
print(re.search(r"'.*?`?(\w+)`?'", str(col_3)).group(1))
# Column<'foo AS bar AS baz'>
# baz

Related

How to create a DataFrame from a list where each column is created by a regex expression

I have a list as such:
lst = ['2021_01_21__11_10_54_1__13928_snapshot.jpg',
'2021_01_21__12_27_44_1__13934_snapshot.jpg',
'2021_01_21__11_11_08_2__13928_snapshot.jpg',
'2021_01_21__12_27_56_2__13934_snapshot.jpg',
'2021_01_21__11_11_19_3__13928_snapshot.jpg',
'2021_01_21__12_28_08_3__13934_snapshot.jpg']
I want to create a DataFrame so that each column will be represented by:
def by_number(path):
    base_name = os.path.basename(path)
    return re.findall(r'[\_]{2}(\d{5})', base_name)
And the rows will be represented by:
def by_index(path):
    base_name = os.path.basename(path)
    return re.findall(r'\_(\d)[\_]{2}', base_name)
So eventually I'll get a DataFrame that looks something like this (one row per index digit, one column per 5-digit number, with the matching filename in each cell):
name_list = ['2021_01_21__11_10_54_1__13928_snapshot.jpg',
'2021_01_21__12_27_44_1__13934_snapshot.jpg',
'2021_01_21__11_11_08_2__13928_snapshot.jpg',
'2021_01_21__12_27_56_2__13934_snapshot.jpg',
'2021_01_21__11_11_19_3__13928_snapshot.jpg',
'2021_01_21__12_28_08_3__13934_snapshot.jpg']
import re
import pandas as pd
df = pd.DataFrame([[0]], columns=['count'])  # initialize dataframe
for name in name_list:
    count = re.search(r'\_(\d)[\_]{2}', name).group(1)
    col = re.search(r'[\_]{2}(\d{5})', name).group(1)
    if (df['count'] == count).any():
        df.loc[df['count'] == count, col] = name
    else:
        new_row = pd.DataFrame([[count, name]], columns=['count', col])
        df = df.append(new_row)
df.set_index('count', inplace=True)
print(df)
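Note that DataFrame.append was removed in pandas 2.0, so on current pandas the append line above could be swapped for pd.concat (a minimal adjustment, keeping the rest of the loop unchanged):
# pandas >= 2.0: DataFrame.append no longer exists, so concatenate instead
df = pd.concat([df, new_row])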

How to pass a list to the PySpark function "withColumn"

I am doing ltrim and rtrim on multiple columns of a dataframe, but right now I am only able to do it individually, like:
# selected_colums = selected_colums.withColumn("last_name", ltrim(selected_colums.last_name))
# selected_colums = selected_colums.withColumn("last_name", rtrim(selected_colums.last_name))
# selected_colums = selected_colums.withColumn("email", ltrim(selected_colums.email))
# selected_colums = selected_colums.withColumn("email", rtrim(selected_colums.email))
# selected_colums = selected_colums.withColumn("phone_number", ltrim(selected_colums.phone_number))
# selected_colums = selected_colums.withColumn("phone_number", rtrim(selected_colums.phone_number))
But I want to do it in a loop, like below:
sdk = ['first_name','last_name','email','phone_number','email_alt','phone_number_alt']
for x in sdk:
    selected_colums = selected_colums.withColumn(x, ltrim(selected_colums.last_name))
It's giving me a syntax error.
Please help me optimize this code so that I can apply ltrim and rtrim to any number of columns just by passing a list.
Check the code below.
Import the required functions:
>>> from pyspark.sql.functions import col, ltrim, rtrim
Apply ltrim and rtrim on all columns
>>> columnExprs = map(lambda c: rtrim(ltrim(col(c))).alias(c),df.columns)
Apply columnExprs in select
df.select(*columnExprs).show()
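Alternatively, if you prefer to keep the withColumn loop from the question, the fix is simply to reference the loop variable x instead of the hard-coded last_name column (a sketch assuming selected_colums is the DataFrame from the question):
from pyspark.sql.functions import col, ltrim, rtrim
sdk = ['first_name','last_name','email','phone_number','email_alt','phone_number_alt']
for x in sdk:
    # trim the column named by the loop variable, not a hard-coded column
    selected_colums = selected_colums.withColumn(x, rtrim(ltrim(col(x))))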

Using langdetect output to be imported into a new column in my dataframe

Being rather new to programming with Python, I tried to detect the language of text segments in a pandas data frame.
So first I made a function for the 'langdetect' package
import pandas as pd
from langdetect import detect
def language_detect(x):
    lang = detect(x)
    print(lang)
My second step would be to feed in the data frame for processing. All the segments that need detecting are in separate rows in the dataframe under the same column header.
result = [language_detect(x) for x in df['column_name']]
df['l_detect'] = pd.append(result)
In the output I see the texts being recognized properly.
But when I try to print result,
it returns only the value 'None' for every entry.
So my questions are:
Why do I get 'None' when the print output from the function has the right values?
How can I attach this to my current data frame, since when I try to append it I get 'None' on
every field as well?
Thanks in advance.
The problem is that result is empty because your function language_detect() doesn't return anything (it is only printing the results).
import pandas as pd
from langdetect import detect
lst = [('this is a test', 1), ('what language is this?', 4), ('stackoverflow is a website', 23)]
df = pd.DataFrame(lst, columns = ['text', 'something'])
def language_detect(x):
    lang = detect(x)
    print(lang)
result = [language_detect(x) for x in df['text']]
result
#Output:[None, None, None]
Just give it a return value:
def language_detect(x):
    lang = detect(x)
    return lang
df['l_detect'] = df['text'].apply(language_detect)
df.head()
#Output:
# text something l_detect
#0 this is a test 1 en
#1 what language is this? 4 en
#2 stackoverflow is a website 23 en
and it will work as expected.
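One caveat, assuming the real data may contain empty or purely numeric cells: detect() raises a LangDetectException for text it cannot featurize, and its output is not deterministic by default. A slightly more defensive variant of the function (a sketch; the 'unknown' fallback is an arbitrary choice) could be:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # make detection results reproducible

def language_detect(x):
    try:
        return detect(x)
    except LangDetectException:
        # raised when langdetect cannot extract language features from the text
        return 'unknown'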

Python3 - using pandas to group rows, where two columns contain values in forward or reverse order: v1,v2 or v2,v1

I'm fairly new to python and pandas, but I've written code that reads an excel workbook, and groups rows based on the values contained in two columns.
So where Col_1=A and Col_2=B, or Col_1=B and Col_2=A, both would be assigned a GroupID=1.
[image: sample spreadsheet data, with rows color coded for ease of visibility]
I've managed to get this working, but I wanted to know if there's a simpler, more efficient, cleaner, less clunky way to do this.
import pandas as pd
df = pd.read_excel('test.xlsx')
# get column values into a list
col_group = df.groupby(['Header_2','Header_3'])
original_list = list(col_group.groups)
# parse list to remove 'reverse-duplicates'
new_list = []
for a,b in original_list:
    if (b,a) not in new_list:
        new_list.append((a,b))
# iterate through each row in the DataFrame
# check to see if values in the new_list[] exist, in forward or reverse
for index, row in df.iterrows():
    for a,b in new_list:
        # if the values exist in forward direction
        if (a in df.loc[index, "Header_2"]) and (b in df.loc[index,"Header_3"]):
            # GroupID value given, where value is index in the new_list[]
            df.loc[index,"GroupID"] = new_list.index((a,b))+1
        # else check if value exists in the reverse direction
        if (b in df.loc[index, "Header_2"]) and (a in df.loc[index,"Header_3"]):
            df.loc[index,"GroupID"] = new_list.index((a,b))+1
# Finally write the DataFrame to a new spreadsheet
writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer, 'Sheet1')
I know of the pandas.groupby([columnA, columnB]) option, but I couldn't figure a way to create groups that contained both (v1, v2) and (v2,v1).
A boolean mask should do the trick:
import pandas as pd
df = pd.read_excel('test.xlsx')
mask = ((df['Header_2'] == 'A') & (df['Header_3'] == 'B') |
(df['Header_2'] == 'B') & (df['Header_3'] == 'A'))
# Label each row in the original DataFrame with
# 1 if it matches the specified criteria, and
# 0 if it does not.
# This column can now be used in groupby operations.
df.loc[:, 'match_flag'] = mask.astype(int)
# Get rows that match the criteria
df[mask]
# Get rows that do not match the criteria
df[~mask]
EDIT: updated answer to address the groupby requirement.
I would do something like this.
import pandas as pd
df = pd.read_excel('test.xlsx')
#make the ordering consistent
df["group1"] = df[["Header_2","Header_3"]].max(axis=1)
df["group2"] = df[["Header_2","Header_3"]].min(axis=1)
#group them together
df = df.sort_values(by=["group1","group2"])
If you need to deal with more than two columns, I can write up a more general way to do this.
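If you also want the numeric GroupID from the question, one option (a sketch building on the group1/group2 columns above) is groupby(...).ngroup(), which assigns the same number to both orderings of a pair:
# (A, B) and (B, A) normalize to the same (group1, group2) pair, so they share a GroupID
df["GroupID"] = df.groupby(["group1", "group2"]).ngroup() + 1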

pandas dataframe output needs to be a string instead of a list

I have a requirement that the result value should be a string. But when I calculate the maximum value of the dataframe, it gives the result as a list.
import pandas as pd
def answer_one():
    df_copy = [df['# Summer'].idxmax()]
    return (df_copy)

df = pd.read_csv('olympics.csv', index_col=0, skiprows=1)
for col in df.columns:
    if col[:2]=='01':
        df.rename(columns={col:'Gold'+col[4:]}, inplace=True)
    if col[:2]=='02':
        df.rename(columns={col:'Silver'+col[4:]}, inplace=True)
    if col[:2]=='03':
        df.rename(columns={col:'Bronze'+col[4:]}, inplace=True)
    if col[:1]=='№':
        df.rename(columns={col:'#'+col[1:]}, inplace=True)

names_ids = df.index.str.split(r'\s\(')
df.index = names_ids.str[0]  # the [0] element is the country name (new index)
df['ID'] = names_ids.str[1].str[:3]  # the [1] element is the abbreviation or ID (take first 3 characters from that)
df = df.drop('Totals')
df.head()
answer_one()
But here answer_one() gives me a list as output and not a string. Can someone help me understand how this can be converted to a string, or how I can get the answer directly from the dataframe as a string? I don't want to convert the list to a string using str(df_copy).
Your first solution would be, as @juanpa.arrivillaga put it, to not wrap it in a list. Your function becomes:
def answer_one():
    df_copy = df['# Summer'].idxmax()
    return (df_copy)
>>> 1
Another thing you might not be expecting: idxmax() returns the index of the max, so perhaps you want to do:
def answer_one():
    df_copy = df['# Summer'].max()
    return (df_copy)
>>> 30
Since you don't want to do str(df_copy), you can do df_copy.astype(str) instead.
Here is how I would write your function:
def get_max_as_string(data, column_name):
    """ Return Max Value from a column as a string."""
    return data[column_name].max().astype(str)
get_max_as_string(df, '# Summer')
>>> '30'
