Converting string with nano seconds to timestamp - apache-spark

Im trying to convert String Datatype to Timestamp data type but Im getting NONE as a result
Sample Data and Code
20181016T192403.635918+02:00
date_format = "yyyyMMdd'T'HHmmss.SSSSSSZ”
data_frame = data_frame.withColumn('dob_ts', unix_timestamp('dob', date_format).cast(‘timestamp’)
Other formats (yyyyMMdd'T'HHmmss.SSS) works fine but not this one.
How to convert this format to timestamp?

You can using udf to define your function. Hence, in the user defined function you can handle this case by an if or what you want:
from pyspark.sql.functions import udf
from datetime import datetime
from pyspark.sql.types import TimestampType
def date_time_to_date(input_date_time):
split_ind = input_date_time.find('T')
new_date = input_date_time
if split_ind > -1:
new_date = input_date_time[:split_ind] + input_date_time[split_ind + 1:]
return datetime.strptime(input_date_time, '%Y%m%d %H%M%S.%f')
udf_date_time_to_date = udf(new_date, TimestampType())
data_frame = data_frame.withColumn('dob_ts', udf_date_time_to_date('dob'))

Related

Python function to iterate each unique column and transform using pyspark

I'm building the following global function in Pyspark to go through each column in my CSV that is in different formats and convert them all to one unique format separated by "-."
I am new to the python world, I am getting
TypeError: Column is not iterable
employeesDF =is reading csv file from local sys
I tried the below code:
def colrename(df):
for col in employeesDF.columns:
F.col(col).alias(col.replace('/s,#', '_'))
return employeesDF
ndf = colrename (employeesDF.columns)
Input:
OutPut:
This will work
import re
def colrename(column):
reg = re.sub(r'\s|#', '_',column)
return reg
df2 = df2.toDF(*(colrename(c) for c in df2.columns))
In case any one interested, I used the code below to do it. I hope this information is useful. Thanks
from pyspark.sql import *
import re
spark = SparkSession.builder.master("local").appName("test").getOrCreate()
df=spark.read.format('csv')\
.option('header',True)\
.option('inferschema',True)\
.load('C:\\bigdata\\datasets\\employee10000_records.csv')
def colrename(df):
for names in df.schema.names:
df = df.withColumnRenamed(names, re.sub(r'([^A-Za-z0-9])','_',names))
return df
colrename (df).show()

How to turn this datetime format without any other characters?

I want my datetime format to be like this "20210613172123" from this "2021-06-13 17:21:23.039823". Is this possible in python?
import datetime
now = datetime.datetime.now()
print(now)
So this is what you need to do :
from datetime import datetime
now = datetime.now()
formatted_time_expression = f'{now.year}{now.month}{now.day}{now.hour}{now.minute}{now.second}{now.microsecond}'
print(formatted_time_expression)
Use strftime to convert datetime object to the required string format.
from datetime import datetime
now = datetime.now()
print(now)
print(now.strftime("%Y%m%d%H%M%S"))
datetime to timestamp
from datetime import datetime
# current date and time
now = datetime.now()
timestamp = datetime.timestamp(now)
print("timestamp =", timestamp)
timestamp to datetime
dt_object = datetime.fromtimestamp(timestamp)
print("dt_object =", dt_object)
print("type(dt_object) =", type(dt_object))

Automatically Extracting the Datetime Format from a Pandas Series [duplicate]

I am trying to format the column 'Data' to make a pattern with dates.
The formats I have are:
1/30/20 16:00
1/31/2020 23:59
2020-02-02T23:43:02
Here is the code for the dataframe.
import requests
import pandas as pd
import numpy as np
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
one_df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
one_df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
I tried adding the code bellow but it doesn't bring the result I wanted
pd.to_datetime(one_df['Data'])
one_df.style.format({"Data": lambda t: t.strftime("%m/%d/%Y")})
Any help?
UPDATE
This is the complete code, but it doesn't work. Many exceptions printed with different date formats.
import requests
import pandas as pd
import numpy as np
from datetime import datetime
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
df = pd.DataFrame()
DATE_FORMATS = ["%m/%d/%y %H:%M", "%m/%d/%Y %H:%M", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S"]
df["Região"] = one_df["Province/State"].fillna(one_df["Admin2"])
df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
df["Confirmados"] = one_df["Confirmed"]
df["Mortes"] = one_df["Deaths"]
df["Recuperados"] = one_df["Recovered"]
def parse(x_):
for fmt in DATE_FORMATS :
try:
tmp = datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
return tmp
except ValueError:
print(x_)
pd.to_datetime(df['Data'])
df['Data'] = df['Data'].apply(lambda x: parse(x))
#df['Data'].strftime('%m/%d/%Y')
#df['Data'] = df['Data'].map(lambda x: x.strftime('%m/%d/%Y') if x else '')
df.to_excel(r'C:\Users\guilh\Downloads\Covid2\Covid-19.xlsx', index=False, encoding="utf8")
print(df)
from datetime import datetime
import pandas as pd
You could save all possible formats in a list as -
DATE_FORMATS = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%m/%d/%y %H:%M", "%m/%d/%Y %H:%M"]
Define a function that loops through the formats and tries to parse it.
(Fixed a bug, where the print statement should have been outside the for loop)
issues = set()
def parse(x_):
for fmt in DATE_FORMATS:
try:
return datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
except ValueError:
pass
issues.add(x_)
sample = ["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"]
df = pd.DataFrame({'data': sample})
df['data'] = df['data'].apply(lambda x: parse(x))
assert df['Data'].isna().sum() == len(issues) == 0, "Issues observed, nulls observed in dataframe"
print("Done")
Output
data
0 01/30/2020
1 01/31/2020
2 02/02/2020
If df.apply() comes across a particular date format that hasn't been defined in the list, it would simply print None since nothing would be returned by the function parse()
also here, letting pd.to_datetime infer the format does the trick:
import pandas as pd
s = pd.to_datetime(["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"])
print(s)
# DatetimeIndex(['2020-01-30 16:00:00', '2020-01-31 23:59:00',
# '2020-02-02 23:43:02'],
# dtype='datetime64[ns]', freq=None)
Note that if your date/time format generally provides the day first (e.g. 30.1.2021 for Jan 30th 2021), set keyword dayfirst=True.

Series format pandas

import pandas as pd
from datetime import datetime
import os
# get username
user = os.getlogin()
def file_process():
data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
# Change the date and time formatting
data["INVDAT"] = data["INVDAT"].apply(lambda x: datetime.combine(x, datetime.min.time()))
data["INVDAT"] = data["INVDAT"].dt.strftime("%m-%d-%Y")
print(data)
# output to new file
# new_data = data
# new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)
if __name__ == '__main__':
file_process()
I'm trying to format the INVDAT column to correct date format like 11/25/19, I've tried multiple solutions but keep running into errors like this one: TypeError: combine() argument 1 must be datetime.date, not int, I then tried to convert the integer to date type but it errors also.
Or you can simply use df["INVDAT"] = pd.to_datetime(df["INVDAT"], format="%m/%d/%y"), in this case you don't need the datetime pakage. For further information you should look the docs.
data['INVDAT'] = data['INVDAT'].astype('str')
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
This solution works but if the date representation is a single month like 12519 ( expected output 1/25/19), it fails. I tried using a conditional to add a 0 to the front if len() < 6 but it gives me an error that the dtype is int64.
import pandas as pd
import os
# get username
user = os.getlogin()
def file_process():
data = pd.read_excel('C:\\Users\\' + user + '\\My Documents\\XINVST.xls')
# Change the date and time formatting
data['INVDAT'] = data['INVDAT'].astype('str')
length = len(data['INVDAT'])
data['INVDAT'].pop(length - 1)
for i in data['INVDAT'].str.len():
if i <= 5:
data['INVDAT'] = data['INVDAT'].apply(lambda x: '{0:0>6}'.format(x))
length = len(data['INVDAT'])
data['INVDAT'].pop(length - 1)
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
else:
data["INVDAT"] = pd.to_datetime(data["INVDAT"])
data["INVDAT"] = data["INVDAT"].dt.strftime("%m/%d/%Y")
# output to new file
new_data = data
new_data.to_excel('C:\\Users\\' + user + '\\Desktop\\XINVST.xls', index=None)
if __name__ == '__main__':
file_process()
This is the solution, it's sloppy but works

How to Find all emoji from a PySpark Dataframe Column?

I have a column in a spark data frame which has several messages. Here is a sample:
message = [
(1, "Sempre com #mariahcarey fazendo aquele aquecimento na voz antes dos shows. Quem lembra dessa? 🎤❤️"),
(2, "Happy Easter from the real bunny 💙🐰"),
(3, "Anakku aku udah diajak nonton malam mingguan kemarin😅🤣 tua😂 Haduhhh bener2 deh😂😂😂 #gadiiing #raffinagita1717")
]
rdd1 = sc.parallelize(message)
df=sqlContext.createDataFrame(rdd1,['id', 'message'])
I need to find all the emojies in the messages. Using the following code it is possible to find the first match:
import emoji
import re
emojis_list = map(lambda x: ''.join(x.split()), emoji.UNICODE_EMOJI.keys())
escape_list = '|'.join(re.escape(p) for p in emojis_list)
df.withColumn("emoji_in_post", fn.regexp_extract("message", escape_list, 0))
But I need all of them. So I tried to create a UDF using vanila python.
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType
import pyspark.sql.functions as fn
def find_all_emo(plain_text):
emo_list = re.findall(escape_list, plain_text)
return emo_list
search_all_emojis = fn.udf(lambda y: find_all_emo(y), ArrayType(StringType()))
But when applying that function to dataframe I am getting error.
TypeError: expected string or bytes-like object
If some one know the problem or has any better solution. Thanks in Advance.
So I found the problem. There is few rows when the value of message is null. So I had to extend the find all function.
def find_all_emo(plain_text):
if plain_text is None:
return None
emo_list = regex.findall(plain_text)
return emo_list
search_all_emojis = fn.udf(lambda y: find_all_emo(y), ArrayType(StringType()))
test = df.withColumn("emoji_in_post", search_all_emojis(fn.col("message")))
Now the output result is empty array or array with some emoji.

Resources