Select column value from Pandas dataframe based on closest value in another dataframe

I have two pandas dataframes, df1 and df2, both with timestamp and data columns sorted by time. For each timestamp in df1, I want to find the row with the closest timestamp in df2. I then want to assign the df2 data for that row to the df1 row.

I am able to get the correct index position and values for each row using get_indexer, but I can’t figure out how to reference the value in the corresponding data column. With the below code, I recieve the error:

ValueError: cannot reindex from a duplicate axis

import pandas as pd

df1 = pd.DataFrame({'Time1': ['2023-11-30 13:01:51.813', 
                              '2023-11-28 10:01:51.760', 
                              '2023-11-28 10:01:51.347', 
                              '2023-11-27 10:01:51.320', 
                              '2023-11-26 10:01:51.260'], 
                    'Data1': [2, 4, 6, 8, 10]})

df2 = pd.DataFrame({'Time2': ['2023-11-30 10:00:03.733', 
                              '2023-11-29 08:00:03.767', 
                              '2023-11-28 16:00:03.523', 
                              '2023-11-27 14:00:03.827', 
                              '2023-11-26 12:00:03.417'], 
                    'Data2': ['A', 'B', 'C', 'D', 'E']})

#Format times as datetime
df1['Time1'] = pd.to_datetime(df1['Time1'], format="%Y-%m-%d %H:%M:%S.%f")
df2['Time2'] = pd.to_datetime(df2['Time2'], format="%Y-%m-%d %H:%M:%S.%f")

#Set Time2 as Data2 index
df2 = df2.set_index('Time2')

#For each Time1, get index position of closest Time2
df1['Data2Index']= df2.index.get_indexer(df1['Time1'], method='nearest')

#For each Time1, get index value for closest Time2
df1['Data2Time'] = df2.index[df2.index.get_indexer(df1['Time1'], method='nearest')]

print(df1)
print(df2)

                    Time1  Data1  Data2Index               Data2Time
0 2023-11-30 13:01:51.813      2           0 2023-11-30 10:00:03.733
1 2023-11-28 10:01:51.760      4           2 2023-11-28 16:00:03.523
2 2023-11-28 10:01:51.347      6           2 2023-11-28 16:00:03.523
3 2023-11-27 10:01:51.320      8           3 2023-11-27 14:00:03.827
4 2023-11-26 10:01:51.260     10           4 2023-11-26 12:00:03.417
                        Data2
Time2
2023-11-30 10:00:03.733     A
2023-11-29 08:00:03.767     B
2023-11-28 16:00:03.523     C
2023-11-27 14:00:03.827     D
2023-11-26 12:00:03.417     E

#For each Time1, get Data2 value for closest Time2 (does not work)
df1['Data2']= df2.Data2[df2.index.get_indexer(df1['Time1'], method='nearest')]

I have used merge_asof. Check this:

import pandas as pd

df1 = pd.DataFrame({'Time1': ['2023-11-30 13:01:51.813', 
                              '2023-11-28 10:01:51.760', 
                              '2023-11-28 10:01:51.347', 
                              '2023-11-27 10:01:51.320', 
                              '2023-11-26 10:01:51.260'], 
                    'Data1': [2, 4, 6, 8, 10]})

df2 = pd.DataFrame({'Time2': ['2023-11-30 10:00:03.733', 
                              '2023-11-29 08:00:03.767', 
                              '2023-11-28 16:00:03.523', 
                              '2023-11-27 14:00:03.827', 
                              '2023-11-26 12:00:03.417'], 
                    'Data2': ['A', 'B', 'C', 'D', 'E']})

#Format times as datetime
df1['Time1'] = pd.to_datetime(df1['Time1'], format="%Y-%m-%d %H:%M:%S.%f")
df2['Time2'] = pd.to_datetime(df2['Time2'], format="%Y-%m-%d %H:%M:%S.%f")

df1 = df1.sort_values(by=['Time1'])
df2 = df2.sort_values(by=['Time2'])

#Set Time2 as Data2 index
df1 = df1.set_index('Time1')
df2 = df2.set_index('Time2')
df3 = pd.merge_asof(left=df1,right=df2,right_index=True,left_index=True,direction='nearest')
df3

                         Data1 Data2
Time1                               
2023-11-26 10:01:51.260     10     E
2023-11-27 10:01:51.320      8     D
2023-11-28 10:01:51.347      6     C
2023-11-28 10:01:51.760      4     C
2023-11-30 13:01:51.813      2     A

If you want both Time1 and Time2 appearing you need to do the same but not declare these columns as index. Like this:

import pandas as pd

df1 = pd.DataFrame({'Time1': ['2023-11-30 13:01:51.813', 
                              '2023-11-28 10:01:51.760', 
                              '2023-11-28 10:01:51.347', 
                              '2023-11-27 10:01:51.320', 
                              '2023-11-26 10:01:51.260'], 
                    'Data1': [2, 4, 6, 8, 10]})

df2 = pd.DataFrame({'Time2': ['2023-11-30 10:00:03.733', 
                              '2023-11-29 08:00:03.767', 
                              '2023-11-28 16:00:03.523', 
                              '2023-11-27 14:00:03.827', 
                              '2023-11-26 12:00:03.417'], 
                    'Data2': ['A', 'B', 'C', 'D', 'E']})

#Format times as datetime
df1['Time1'] = pd.to_datetime(df1['Time1'], format="%Y-%m-%d %H:%M:%S.%f")
df2['Time2'] = pd.to_datetime(df2['Time2'], format="%Y-%m-%d %H:%M:%S.%f")

df1 = df1.sort_values(by=['Time1'])
df2 = df2.sort_values(by=['Time2'])

df3 = pd.merge_asof(left=df1,right=df2,left_on='Time1',right_on='Time2',direction='nearest')
df3

                    Time1  Data1                   Time2 Data2
0 2023-11-26 10:01:51.260     10 2023-11-26 12:00:03.417     E
1 2023-11-27 10:01:51.320      8 2023-11-27 14:00:03.827     D
2 2023-11-28 10:01:51.347      6 2023-11-28 16:00:03.523     C
3 2023-11-28 10:01:51.760      4 2023-11-28 16:00:03.523     C
4 2023-11-30 13:01:51.813      2 2023-11-30 10:00:03.733     A

if you want to merge only specific columns than in the merge line you should limit the columns, e.g.:

df3 = pd.merge_asof(left=df1[['Time1', 'Data1']],right=df2[['Time2', 'Data2']],left_on='Time1',right_on='Time2',direction='nearest')

Leave a Comment