Merging series of pandas dataframe into single dataframe - python-3.x

I have a series of pandas data frames stored in a variable df, similar to below:
df
| 0 | 1 |
+-------+--------+
|ABCD | WXYZ |

| 0 | 1 |
+-------+--------+
|DEFJ | HJKL |

| 0 | 1 |
+-------+--------+
|ZXCT | WYOM |

| 0 | 1 |
+-------+--------+
|TYZX | NMEX |
I want to merge them into a single pandas data frame as below:
| 0 | 1 |
+-------+--------+
|ABCD | WXYZ |
|DEFJ | HJKL |
|ZXCT | WYOM |
|TYZX | NMEX |
So how can I merge a series of pandas dataframes into one single pandas dataframe?

As your code is now, you're only outputting one dataframe with a single row (overwriting the others).
Try this:
# Copy the names to pandas dataframes and save them in a list
import pandas as pd
dfs = []
for j in range(0,5):
    for i in divs[j].find_elements_by_tag_name('a'):
        i = i.get_attribute('text')
        i = parse_name(i)
        df = pd.DataFrame(i)
        df = df.transpose()
        dfs.append(df)

# Aggregate all dataframes in one
new_df = dfs[0]
for df in dfs[1:]:
    new_df = new_df.append(df)
# Update index
new_df = new_df.reset_index(drop=True)
# Print first five rows
new_df.head()
0 1
0 Lynn Batten Emeritus Professor
1 Andrzej Goscinski Emeritus Professor
2 Jemal Abawajy Professor
3 Maia Angelova Professor
4 Gleb Beliakov Professor

There are four ways to concat or merge dataframes; you may refer to this post for details.
These are the most common implementations:
import pandas as pd
df1 = pd.DataFrame({0:['ABCD'], 1:['WXYX']})
df2 = pd.DataFrame({0:['DEFJ'], 1:['HJKL']})
df3 = pd.DataFrame({0:['ZXCT'], 1:['WYOM']})
...
df = pd.concat([df1, df2, df3], axis=0)
print(df.head())
or, if you have a list of dataframes with the same headers, you can try:
dfs = [df1, df2, df3 ..]
df = pd.concat(dfs, axis=0)
and the simplest way is to just use df.append:
df = df.append(anotherdf)
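Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on current versions pd.concat is the way to go. A minimal sketch using single-row frames like the ones in the question (if they live in a pandas Series, pd.concat(df.tolist(), ignore_index=True) works the same way):
import pandas as pd

# Build a few single-row frames like the ones in the question.
dfs = [
    pd.DataFrame({0: ['ABCD'], 1: ['WXYZ']}),
    pd.DataFrame({0: ['DEFJ'], 1: ['HJKL']}),
    pd.DataFrame({0: ['ZXCT'], 1: ['WYOM']}),
    pd.DataFrame({0: ['TYZX'], 1: ['NMEX']}),
]

# One concat call stacks them and rebuilds a clean 0..n-1 index.
merged = pd.concat(dfs, ignore_index=True)
print(merged)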

Related

break one DF row to multiple row in another DF

I am looking to convert one DF into another.
the difference is 1 row in DF1 may be 3 rows in DF2
example DF1
cust_id | email_id_1 | email_id_2 | email_id_3 |
1 |one_1#m.com | one_2#m.com| one_3#m.com|
then DF2 will be like
cust_id | email_id |
1 |one_1#m.com |
1 |one_2#m.com |
1 |one_3#m.com |
I have written the below code, which is giving me the error AttributeError: 'str' object has no attribute 'cast':
from pyspark.sql.types import StructType, StructField, LongType, StringType

# Create a schema for the dataframe
dfSchema = StructType([
    StructField('CUST_ID', LongType()),
    StructField('EMAIL_ADDRESS', StringType())
])

dfData = []
for row in initialCustEmailDetailsDF.rdd.collect():
    if row["email_address_1"] != "":
        temp1 = [row["cust_id"].cast(LongType()), row["email_address_1"]]
        # error : AttributeError: 'str' object has no attribute 'cast'
        dfData.append(temp1)
    if row["email_address_2"] != "":
        temp2 = [row["cust_id"].cast(LongType()), row["email_address_2"]]
        dfData.append(temp2)
    if row["email_address_3"] != "":
        temp3 = [row["cust_id"].cast(LongType()), row["email_address_3"]]
        dfData.append(temp3)
# Convert list to RDD
rdd = spark.sparkContext.parallelize(dfData)
# Create data frame
df = spark.createDataFrame(rdd, dfSchema)
df.show()
You may be looking for explode_outer. (The AttributeError comes from calling .cast() on a plain Python value: inside a collected Row, row["cust_id"] is already an ordinary string and .cast is a Column method, so a plain int(row["cust_id"]) would be enough in the original loop.) Starting from the input:
df.show()
+-------+-----------+-----------+-----------+
|cust_id| email_id_1| email_id_2| email_id_3|
+-------+-----------+-----------+-----------+
| 1|one_1#m.com|one_2#m.com| null|
| 2|one_1#m.com| null|one_3#m.com|
| 3|one_1#m.com|one_2#m.com|one_3#m.com|
+-------+-----------+-----------+-----------+
import pyspark.sql.functions as F
df2 = df.select(
'cust_id',
F.explode_outer(
F.array('email_id_1', 'email_id_2', 'email_id_3')
).alias('email_id')
)
df2.show()
+-------+-----------+
|cust_id| email_id|
+-------+-----------+
| 1|one_1#m.com|
| 1|one_2#m.com|
| 1| null|
| 2|one_1#m.com|
| 2| null|
| 2|one_3#m.com|
| 3|one_1#m.com|
| 3|one_2#m.com|
| 3|one_3#m.com|
+-------+-----------+
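To match the DF2 from the question exactly (no null rows for missing emails), the exploded result can be filtered afterwards; a small sketch under the same assumptions:
import pyspark.sql.functions as F

# Drop the null rows that explode_outer produces for missing email columns,
# so the result matches the DF2 layout shown in the question.
df2_clean = df2.where(F.col('email_id').isNotNull())
df2_clean.show()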

Identify cells with characters other than numbers in dataframe

I have a dataframe with two columns named A and B. How can I fill column B so that a cell shows "text" if the corresponding cell in A contains anything other than a plain number (including numbers written with a comma, like "4,2"), and "number" when it is just a number?
(See the example below)
You could do it using apply on column A :
import pandas as pd
data = [{'A': '4 tons'},
        {'A': '2.0*'},
        {'A': 4.1},
        {'A': 4.2},
        {'A': '4,2'},
        {'A': '6,3'}]
df = pd.DataFrame(data)

def checkType(x):
    try:
        # Try to convert the value into a float
        float(x)
        return 'number'
    except:
        # If there's an error, it's text
        return 'text'

df['B'] = df.A.apply(lambda x: checkType(x))
Output
| A | B |
|:-------|:-------|
| 4 tons | text |
| 2.0* | text |
| 4.1 | number |
| 4.2 | number |
| 4,2 | text |
| 6,3 | text |
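A vectorized variant of the same check (a sketch, keeping the convention that comma-separated values count as text) is to let pd.to_numeric decide what parses as a number:
import numpy as np
import pandas as pd

data = [{'A': '4 tons'}, {'A': '2.0*'}, {'A': 4.1},
        {'A': 4.2}, {'A': '4,2'}, {'A': '6,3'}]
df = pd.DataFrame(data)

# to_numeric(errors='coerce') returns NaN for anything that is not a plain
# number, including values like '4,2', so notna() separates the two cases.
df['B'] = np.where(pd.to_numeric(df['A'], errors='coerce').notna(), 'number', 'text')
print(df)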
What about this?
import pandas as pd

# making just column A
ls = ['4 tons', '4.0*', '4,0', '4.1', '5.2', '6,3']
df = pd.DataFrame()
df['A'] = ls
df['B'] = ls  # copying for now

# creating a pivot column (regex=False so '.' is removed literally)
df["C"] = df["A"].str.replace(".", "", regex=False)
df["C"] = df['C'].str.isdigit()

for i, elem in enumerate(df['C']):
    if elem:
        df.loc[i, 'B'] = 'number'
    else:
        df.loc[i, 'B'] = 'text'

del df['C']  # delete pivot column

Spark: Join two dataframes on an array type column

I have a simple use case
I have two dataframes df1 and df2, and I am looking for an efficient way to join them?
df1: Contains my main dataframe (billions of records)
+--------+-----------+--------------+
|doc_id |doc_name |doc_type_id |
+--------+-----------+--------------+
| 1 |doc_name_1 |[1,4] |
| 2 |doc_name_2 |[3,2,6] |
+--------+-----------+--------------+
df2: Contains the labels of doc types (40,000 records); as it's small, I am broadcasting it.
+------------+----------------+
|doc_type_id |doc_type_name |
+------------+----------------+
| 1 |doc_type_1 |
| 2 |doc_type_2 |
| 3 |doc_type_3 |
| 4 |doc_type_4 |
| 5 |doc_type_5 |
| 6 |doc_type_6 |
+------------+----------------+
I would like to join these two dataframes to get something like this:
+--------+------------+--------------+----------------------------------------+
|doc_id |doc_name |doc_type_id |doc_type_name |
+--------+------------+--------------+----------------------------------------+
| 1 |doc_name_1 |[1,4] |["doc_type_1","doc_type_4"] |
| 2 |doc_name_2 |[3,2,6] |["doc_type_3","doc_type_2","doc_type_6"]|
+--------+------------+--------------+----------------------------------------+
Thanks
We can use array_contains + groupBy + collect_list functions for this case.
Example:
val df1=Seq(("1","doc_name_1",Seq(1,4)),("2","doc_name_2",Seq(3,2,6))).toDF("doc_id","doc_name","doc_type_id")
val df2=Seq(("1","doc_type_1"),("2","doc_type_2"),("3","doc_type_3"),("4","doc_type_4"),("5","doc_type_5"),("6","doc_type_6")).toDF("doc_type_id","doc_type_name")
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
df1.createOrReplaceTempView("tbl")
df2.createOrReplaceTempView("tbl2")
spark.sql("select a.doc_id,a.doc_name,a.doc_type_id,collect_list(b.doc_type_name) doc_type_name from tbl a join tbl2 b on array_contains(a.doc_type_id,int(b.doc_type_id)) = TRUE group by a.doc_id,a.doc_name,a.doc_type_id").show(false)
//+------+----------+-----------+------------------------------------+
//|doc_id|doc_name |doc_type_id|doc_type_name |
//+------+----------+-----------+------------------------------------+
//|2 |doc_name_2|[3, 2, 6] |[doc_type_2, doc_type_3, doc_type_6]|
//|1 |doc_name_1|[1, 4] |[doc_type_1, doc_type_4] |
//+------+----------+-----------+------------------------------------+
Another way to achieve this is by using explode + join + collect_list:
val df3=df1.withColumn("arr",explode(col("doc_type_id")))
df3.join(df2,df2.col("doc_type_id") === df3.col("arr"),"inner").
groupBy(df3.col("doc_id"),df3.col("doc_type_id"),df3.col("doc_name")).
agg(collect_list(df2.col("doc_type_name")).alias("doc_type_name")).
show(false)
//+------+-----------+----------+------------------------------------+
//|doc_id|doc_type_id|doc_name |doc_type_name |
//+------+-----------+----------+------------------------------------+
//|1 |[1, 4] |doc_name_1|[doc_type_1, doc_type_4] |
//|2 |[3, 2, 6] |doc_name_2|[doc_type_2, doc_type_3, doc_type_6]|
//+------+-----------+----------+------------------------------------+
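If you are working in PySpark rather than Scala, a sketch of the same explode + join + collect_list approach (assuming df1 and df2 have the columns shown above and the id types on both sides match) would be:
import pyspark.sql.functions as F

# Explode the array so each (doc_id, single type id) becomes its own row,
# join against the small label table (broadcast hint), then collect the
# labels back into one array per document.
df3 = df1.withColumn('single_type_id', F.explode('doc_type_id'))
result = (
    df3.join(F.broadcast(df2), df3['single_type_id'] == df2['doc_type_id'], 'inner')
       .groupBy(df3['doc_id'], df3['doc_name'], df3['doc_type_id'])
       .agg(F.collect_list('doc_type_name').alias('doc_type_name'))
)
result.show(truncate=False)
If doc_type_id in df2 is stored as a string, cast one side of the join condition first, e.g. df2['doc_type_id'].cast('int').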

Fill dataframe cells entry using dataframe column names and index

I try to fill a dataframe using the following approach:
1. generate an m x n size dataframe
2. column names for the dataframe are A to N and are read from a list passed to the method
3. define the index for the dataframe
4. fill the dataframe entries with column name + _ + index
import numpy as np
import pandas as pd
from tabulate import tabulate
def generate_data(N_rows, N_cols, names_df=[]):
    if N_rows == 4:
        d16 = ['RU19-24', 'RU13-18', 'RU7-12', 'RU1-6']
        df = pd.DataFrame(np.zeros((N_rows, N_cols)), index=d16, columns=names_df)
    else:
        print("The Elevation for each domain is defined by 4, you defined elevation: ", N_rows)
        df = None
    # df.loc[[],'Z'] = 3
    return tabulate(df, headers='keys', tablefmt='psql')
a = generate_data(4,2, ['A', 'B'])
print(a)
Out:
+---------+-----+-----+
| | A | B |
|---------+-----+-----|
| RU19-24 | 0 | 0 |
| RU13-18 | 0 | 0 |
| RU7-12 | 0 | 0 |
| RU1-6 | 0 | 0 |
+---------+-----+-----+
Is it possible to take the index and concatenate with the column names to get the following output ?
+---------+-------------+-------------+
| | A | B |
|---------+-------------+-------------|
| RU19-24 | A_RU19-24 | B_RU19-24 |
| RU13-18 | A_RU13-18 | B_RU13-18 |
| RU7-12 | A_RU7-12 | B_RU7-12 |
| RU1-6 | A_RU1-6 | B_RU1-6 |
+---------+-------------+-------------+
IIUC, you can use apply, which takes each column of the dataframe as a pd.Series with an index (the dataframe index) and a series name (the dataframe column header):
df = pd.DataFrame(index=['RU19-24','RU13-18','RU7-12','RU1-6'], columns = ['A','B'])
df.apply(lambda x: x.name+'_'+x.index)
Output:
A B
RU19-24 A_RU19-24 B_RU19-24
RU13-18 A_RU13-18 B_RU13-18
RU7-12 A_RU7-12 B_RU7-12
RU1-6 A_RU1-6 B_RU1-6
or use np.add.outer
df = pd.DataFrame(index=['RU19-24','RU13-18','RU7-12','RU1-6'], columns = ['A','B'])
df_out = pd.DataFrame(np.add.outer(df.columns+'_',df.index).T, index=df.index, columns=df.columns)
df_out
Output:
A B
RU19-24 A_RU19-24 B_RU19-24
RU13-18 A_RU13-18 B_RU13-18
RU7-12 A_RU7-12 B_RU7-12
RU1-6 A_RU1-6 B_RU1-6
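Plugged back into the question's helper, the apply version could look roughly like this (a sketch that keeps only the N_rows == 4 branch and the original signature):
import pandas as pd
from tabulate import tabulate

def generate_data(N_rows, N_cols, names_df=[]):
    # Sketch: only the N_rows == 4 branch from the question is handled here;
    # N_cols is implied by the length of names_df.
    d16 = ['RU19-24', 'RU13-18', 'RU7-12', 'RU1-6']
    df = pd.DataFrame(index=d16, columns=names_df)
    # Fill each cell with "<column name>_<index label>".
    df = df.apply(lambda x: x.name + '_' + x.index)
    return tabulate(df, headers='keys', tablefmt='psql')

print(generate_data(4, 2, ['A', 'B']))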

Iterate cols PySpark

I have a SQL table containing 40 columns (ID, Product, Product_ID, Date, etc.) and would like to iterate over all columns to get their distinct values.
Customer table (sample):
ID Product
1 gadget
2 VR
2 AR
3 hi-fi
I have tried using dropDuplicates within a function that loops over all columns, but the resulting output only gives one distinct value per column instead of all possible distinct values.
Expected Result:
Column Value
ID 1
ID 2
ID 3
Product gadget
Product VR
Product AR
Product hi-fi
Actual Result:
Column Value
ID 1
Product gadget
The idea is to use collect_set() to fetch the distinct elements of each column and then explode the dataframe.
from pyspark.sql.functions import collect_set

# All columns which need to be aggregated should be added here in col_list.
col_list = ['ID', 'Product']
exprs = [collect_set(x) for x in col_list]
Let's start aggregating.
from pyspark.sql.functions import lit , collect_set, explode, array, struct, col, substring, length, expr
df = spark.createDataFrame([(1,'gadget'),(2,'VR'),(2,'AR'),(3,'hi-fi')], schema = ['ID','Product'])
df = df.withColumn('Dummy',lit('Dummy'))
#While exploding later, the datatypes must be the same, so we have to cast ID as a String.
df = df.withColumn('ID',col('ID').cast('string'))
#Creating the list of distinct values.
df = df.groupby("Dummy").agg(*exprs)
df.show(truncate=False)
+-----+---------------+-----------------------+
|Dummy|collect_set(ID)|collect_set(Product) |
+-----+---------------+-----------------------+
|Dummy|[3, 1, 2] |[AR, VR, hi-fi, gadget]|
+-----+---------------+-----------------------+
Next, transpose the single aggregated row into (column, values) pairs with a small helper:
def to_transpose(df, by):
    # Filter dtypes and split into column names and type description
    cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in by))

    # Spark SQL supports only homogeneous columns
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"

    # Create and explode an array of (column_name, column_value) structs
    kvs = explode(array([
        struct(lit(c).alias("key"), col(c).alias("val")) for c in cols
    ])).alias("kvs")

    return df.select(by + [kvs]).select(by + ["kvs.key", "kvs.val"])
df = to_transpose(df, ['Dummy']).drop('Dummy')
df.show()
+--------------------+--------------------+
| key| val|
+--------------------+--------------------+
| collect_set(ID)| [3, 1, 2]|
|collect_set(Product)|[AR, VR, hi-fi, g...|
+--------------------+--------------------+
df = df.withColumn('val', explode(col('val')))
df = df.withColumnRenamed('key', 'Column').withColumnRenamed('val', 'Value')
df = df.withColumn('Column', expr("substring(Column,13,length(Column)-13)"))
df.show()
+-------+------+
| Column| Value|
+-------+------+
| ID| 3|
| ID| 1|
| ID| 2|
|Product| AR|
|Product| VR|
|Product| hi-fi|
|Product|gadget|
+-------+------+
Note: All the columns which are not strings should be converted to String, like df = df.withColumn('ID', col('ID').cast('string')). Otherwise, you will get an error.
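A simpler, if less compact, alternative (a sketch assuming an active SparkSession named spark) is to take the distinct values of each column separately and union the per-column results:
import functools
from pyspark.sql import functions as F

df = spark.createDataFrame([(1, 'gadget'), (2, 'VR'), (2, 'AR'), (3, 'hi-fi')],
                           schema=['ID', 'Product'])

# One small (Column, Value) dataframe per input column, then union them all.
per_column = [
    df.select(F.lit(c).alias('Column'), F.col(c).cast('string').alias('Value')).distinct()
    for c in df.columns
]
result = functools.reduce(lambda a, b: a.unionByName(b), per_column)
result.show()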
