In [16]:
#Import Libraries
import pandas as pd
import jellyfish
In [17]:
# Load the first dataset from data1.csv (path relative to the working
# directory) into a DataFrame. The bare name on the last line renders the
# frame as the cell output.
df1 = pd.read_csv(filepath_or_buffer='data1.csv')
df1
Out[17]:
data1 data2
0 sandeep 80
1 riitwiikaa 1000
2 rituuraaj 20
3 roohit 70
In [18]:
# Load the second dataset from data2.csv (path relative to the working
# directory) into a DataFrame. The bare name on the last line renders the
# frame as the cell output.
df2 = pd.read_csv(filepath_or_buffer='data2.csv')
df2
Out[18]:
data3 data4
0 saandep 50
1 riitwika 2000
2 riituraj 30
3 rohiiet 60
In [19]:
# Add a 'soundex' column to the first dataset holding the Soundex code of
# each value in 'data1'. This column is the key the merge cell joins on.
# jellyfish.soundex is handed to apply directly; it is called once per value.
# Note: this adds the column to df1 in place.
df1['soundex'] = df1['data1'].apply(jellyfish.soundex)
df1
Out[19]:
data1 data2 soundex
0 sandeep 80 S531
1 riitwiikaa 1000 R320
2 rituuraaj 20 R362
3 roohit 70 R300
In [20]:
# Add a 'soundex' column to the second dataset holding the Soundex code of
# each value in 'data3'. This column is the key the merge cell joins on.
# jellyfish.soundex is handed to apply directly; it is called once per value.
# Note: this adds the column to df2 in place.
df2['soundex'] = df2['data3'].apply(jellyfish.soundex)
df2
Out[20]:
data3 data4 soundex
0 saandep 50 S531
1 riitwika 2000 R320
2 riituraj 30 R362
3 rohiiet 60 R300
In [21]:
# Join the two datasets on their shared 'soundex' column. how='inner' is the
# pandas default, spelled out here: a row whose code has no counterpart in
# the other dataset is left out of the result.
# NOTE(review): if a code occurs more than once on either side, every
# matching pair of rows is emitted -- confirm the codes are unique enough.
df = df1.merge(df2, how='inner', on='soundex')
df
Out[21]:
data1 data2 soundex data3 data4
0 sandeep 80 S531 saandep 50
1 riitwiikaa 1000 R320 riitwika 2000
2 rituuraaj 20 R362 riituraj 30
3 roohit 70 R300 rohiiet 60
In [22]:
# Write the merged dataset to joined_data.csv as UTF-8 text, omitting the
# DataFrame's row index from the file.
df.to_csv(path_or_buf='joined_data.csv', index=False, encoding='utf-8')