I have a column in a Spark DataFrame that contains several messages. Here is a sample:
message = [
(1, "Sempre com #mariahcarey fazendo aquele aquecimento na voz antes dos shows. Quem lembra dessa? 🎤❤️"),
(2, "Happy Easter from the real bunny 💙🐰"),
(3, "Anakku aku udah diajak nonton malam mingguan kemarin😅🤣 tua😂 Haduhhh bener2 deh😂😂😂 #gadiiing #raffinagita1717")
]
rdd1 = sc.parallelize(message)
df=sqlContext.createDataFrame(rdd1,['id', 'message'])
I need to find all the emojis in the messages. Using the following code it is possible to find the first match:
import emoji
import re
import pyspark.sql.functions as fn

# build one alternation pattern that matches any emoji known to the emoji package
emojis_list = map(lambda x: ''.join(x.split()), emoji.UNICODE_EMOJI.keys())
escape_list = '|'.join(re.escape(p) for p in emojis_list)

df.withColumn("emoji_in_post", fn.regexp_extract("message", escape_list, 0))
But I need all of them, so I tried to create a UDF using vanilla Python.
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType
import pyspark.sql.functions as fn
def find_all_emo(plain_text):
    emo_list = re.findall(escape_list, plain_text)
    return emo_list

search_all_emojis = fn.udf(lambda y: find_all_emo(y), ArrayType(StringType()))
But when applying that function to the DataFrame I am getting an error:
TypeError: expected string or bytes-like object
If someone knows the problem or has a better solution, please share. Thanks in advance.
So I found the problem. There are a few rows where the value of message is null, so I had to extend the find-all function.
regex = re.compile(escape_list)  # compiled pattern built from escape_list above

def find_all_emo(plain_text):
    if plain_text is None:
        return None
    emo_list = regex.findall(plain_text)
    return emo_list

search_all_emojis = fn.udf(lambda y: find_all_emo(y), ArrayType(StringType()))
test = df.withColumn("emoji_in_post", search_all_emojis(fn.col("message")))
Now the output is either an empty array or an array containing the matched emojis.
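If you want to avoid maintaining the regex, here is a minimal alternative sketch (assuming the same emoji package as above, where emoji.UNICODE_EMOJI is a dict keyed by the emoji characters, and ignoring multi-codepoint emojis) that checks each character against that lookup table inside a null-safe UDF:

import emoji
import pyspark.sql.functions as fn
from pyspark.sql.types import ArrayType, StringType

def find_all_emo_simple(plain_text):
    # null-safe: return an empty list instead of None for missing messages
    if plain_text is None:
        return []
    # keep only the characters the emoji package knows about
    return [ch for ch in plain_text if ch in emoji.UNICODE_EMOJI]

search_all_emojis_simple = fn.udf(find_all_emo_simple, ArrayType(StringType()))
df.withColumn("emoji_in_post", search_all_emojis_simple(fn.col("message"))).show(truncate=False)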
I have a DataFrame with the following values:
id    Country    Interest
00    Russian    Digestion;Destillation
I want to pivot the Interest column into a new column, in Azure Databricks with Python, like this:
id      Country   Int            Interest
00Q7    Russ      Digestion      Digestion;Destillation
00Q7    Russ      Destillation   Digestion;Destillation
Please advise how this can be done.
Regards
RK
I have created a sample dataframe similar to yours using the following code:
data = [['00Q7','Russian Federation','Digestion;Destillation'],['00Q6','United States','Oils;Automobiles']]
df = spark.createDataFrame(data=data,schema = ['id','country','interests'])
display(df)
To get the desired output (like yours), I first split the data in the interests column using pyspark.sql.functions.split.
from pyspark.sql.functions import split,col
df1 = df.withColumn("interest", split(col("interests"), ";"))
display(df1)
Then I exploded the new interest column using pyspark.sql.functions.explode to get the required output.
from pyspark.sql.functions import explode
op = df1.withColumn('interest',explode(col('interest')))
display(op)
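For what it's worth, the two steps can also be chained in a single expression; this sketch should be equivalent to df1/op above:

from pyspark.sql.functions import split, explode, col

op = df.withColumn('interest', explode(split(col('interests'), ';')))
display(op)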
UPDATE:
data = [['00Q7','Russian Federation','01_Digestion;02_Destillation']]
df = spark.createDataFrame(data=data,schema = ['id','country','interests'])
#display(df)
from pyspark.sql.functions import split,col
df1 = df.withColumn("interest", split(col("interests"), ";"))
from pyspark.sql.functions import explode
op = df1.withColumn('interest',explode(col('interest')))
#UPDATE
from pyspark.sql.functions import concat,lit
op.withColumn("set",concat(lit('Set'),split(col('interest'),'_').getItem(0))).show(truncate=False)
UPDATE-2 (the same prefix logic, on a pandas DataFrame pdf):
pdf['set'] = pdf['interest'].str.split('_').str[0]
import numpy as np
# drop the leading zero for single-digit prefixes: '01' -> 'Set1', '12' -> 'Set12'
pdf["set"] = np.where(pdf["set"].astype(int) < 10, 'Set' + pdf['set'].str[1], 'Set' + pdf['set'])
I'm building the following global function in PySpark to go through each column in my CSV (they come in different formats) and convert them all to one uniform format separated by "-".
I am new to the Python world, and I am getting:
TypeError: Column is not iterable
employeesDF is read from a CSV file on the local filesystem.
I tried the below code:
def colrename(df):
    for col in employeesDF.columns:
        F.col(col).alias(col.replace('/s,#', '_'))
    return employeesDF

ndf = colrename(employeesDF.columns)
This will work:
import re

def colrename(column):
    reg = re.sub(r'\s|#', '_', column)
    return reg

df2 = df2.toDF(*(colrename(c) for c in df2.columns))
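A quick illustration with some hypothetical column names (not from the original question):

cols = ['first name', 'emp#id', 'salary']
print([colrename(c) for c in cols])   # ['first_name', 'emp_id', 'salary']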
In case anyone is interested, I used the code below to do it. I hope this information is useful. Thanks!
from pyspark.sql import *
import re

spark = SparkSession.builder.master("local").appName("test").getOrCreate()

df = spark.read.format('csv')\
    .option('header', True)\
    .option('inferschema', True)\
    .load('C:\\bigdata\\datasets\\employee10000_records.csv')

def colrename(df):
    for names in df.schema.names:
        df = df.withColumnRenamed(names, re.sub(r'([^A-Za-z0-9])', '_', names))
    return df

colrename(df).show()
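As a small hedged check with hypothetical column names, every non-alphanumeric character becomes an underscore:

sample = spark.createDataFrame([(1, 2)], ['emp id', 'dept#no'])
colrename(sample).printSchema()   # columns become emp_id and dept_no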
from pyspark.sql import Row

df = spark.sparkContext.parallelize([
    Row(name='Angel', age=5, height=None, weight=40, desc="Where is Angel"),
    Row(name='Bobby', age=None, height=40, weight=50, desc="This is Bobby")
]).toDF()

df.select(map(col("desc"), col("age")).alias("complex_map"))\
    .selectExpr("explode(complex_map)").show(2)
While running the above code I am getting an error:
TypeError: Column is not iterable
Please let me know where I am going wrong.
You need to use the create_map function, not the native Python map:
import pyspark.sql.functions as F
df.select(F.create_map(F.col("desc"), F.col("age")).alias("complex_map"))\
.selectExpr("explode(complex_map)").show(2)
To simplify the code further,
df.select(
F.explode(
F.create_map(F.col("desc"), F.col("age"))
).alias("complex_map")
).show(2)
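One more small sketch: when a map is exploded, the generated columns are named key and value by default, and they can be renamed inline (the names below are just illustrative):

df.select(
    F.explode(
        F.create_map(F.col("desc"), F.col("age"))
    ).alias("description", "age")
).show(2)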
I am new to AWS Glue. I have created a job that modifies phone numbers from a column and updates the DataFrame.
The script below works fine on my local machine, where I run it with PySpark.
It basically adds '+00' to those phone numbers which do not start with '0'.
## PhoneNumber column
6-451-512-3627
0-512-582-3548
1-043-733-0050
def addCountry_code(phoneNo):
    countryCode = '+00' + phoneNo
    if phoneNo[:1] != '0':
        return str(countryCode)
    else:
        return str(phoneNo)

phone_replace_udf = udf(lambda x: addCountry_code(x), StringType())
phoneNo_rep_DF = concatDF.withColumn("phoneNumber", phone_replace_udf(sf.col('phoneNumber')))  # .drop('phoneNumber')
##output
+006-451-512-3627
0-512-582-3548
+001-043-733-0050
But when I ran the same code in the Glue context, it threw the following error:
TypeError: must be str, not NoneType (raised in addCountry_code at countryCode = '+00' + phoneNo)
I am wondering why this function fails in Glue?
I would appreciate it if anyone can help with this.
This should give the desired result. Use spark.udf.register to register the function:
import json
import boto3
import pyspark.sql.dataframe
from pyspark.sql.types import StringType

ds = [{'phoneNumber': '6-451-512-3627'},
      {'phoneNumber': '0-512-582-3548'},
      {'phoneNumber': '1-043-733-0050'}]

sf = spark.createDataFrame(ds)

def addCountry_code(phoneNo):
    countryCode = '+00' + phoneNo
    if phoneNo[:1] != '0':
        return str(countryCode)
    else:
        return str(phoneNo)

spark.udf.register('phone_replace_udf', lambda x: addCountry_code(x), StringType())

sf.createOrReplaceTempView('sf')
spark.sql('select phone_replace_udf(phoneNumber) from sf').collect()
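Note that the "must be str, not NoneType" message suggests some phoneNumber values in the Glue source are null. A hedged tweak (not part of the original answer) that guards against that:

def addCountry_code(phoneNo):
    # pass nulls through untouched so they don't crash the UDF
    if phoneNo is None:
        return None
    return '+00' + phoneNo if phoneNo[:1] != '0' else phoneNo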
You can achieve this without using a udf (udfs are generally slower than built-in functions).
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, concat, lit

spark = SparkSession.builder.getOrCreate()

## PhoneNumber column
ds = [{'PhoneNumber': '6-451-512-3627'},
      {'PhoneNumber': '0-512-582-3548'},
      {'PhoneNumber': '1-043-733-0050'}]

df = spark.createDataFrame(ds)

df = df.withColumn('PhoneNumber', when(
    ~df['PhoneNumber'].startswith('0'), concat(lit('+00'), df['PhoneNumber'])) \
    .otherwise(df['PhoneNumber']))

df.show()
+-----------------+
| PhoneNumber|
+-----------------+
|+006-451-512-3627|
| 0-512-582-3548|
|+001-043-733-0050|
+-----------------+
I'm trying to convert a string datatype to a timestamp datatype, but I'm getting None as a result.
Sample data and code:
20181016T192403.635918+02:00
date_format = "yyyyMMdd'T'HHmmss.SSSSSSZ"
data_frame = data_frame.withColumn('dob_ts', unix_timestamp('dob', date_format).cast('timestamp'))
Other formats (yyyyMMdd'T'HHmmss.SSS) work fine, but not this one.
How to convert this format to timestamp?
You can use a udf to define your own parsing function; inside the user-defined function you can handle this case with an if, or however else you want:
from pyspark.sql.functions import udf
from datetime import datetime
from pyspark.sql.types import TimestampType

def date_time_to_date(input_date_time):
    # replace the 'T' separator with a space so the string matches the strptime
    # format below; %z handles the +02:00 offset (Python 3.7+)
    split_ind = input_date_time.find('T')
    new_date = input_date_time
    if split_ind > -1:
        new_date = input_date_time[:split_ind] + ' ' + input_date_time[split_ind + 1:]
    return datetime.strptime(new_date, '%Y%m%d %H%M%S.%f%z')

udf_date_time_to_date = udf(date_time_to_date, TimestampType())
data_frame = data_frame.withColumn('dob_ts', udf_date_time_to_date('dob'))
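An alternative sketch without a udf (assuming Spark 3.x datetime patterns, where S covers fractional seconds up to microseconds and XXX covers the +02:00 style offset) would parse the string directly:

from pyspark.sql.functions import to_timestamp

data_frame = data_frame.withColumn(
    'dob_ts',
    to_timestamp('dob', "yyyyMMdd'T'HHmmss.SSSSSSXXX")
)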