Multiple “Vlookups” and creating multiple new columns using 2 dataframes and merging them all together

How would I get from df1 and df2 to df3 and then merge them to create df4.

df1 = {'Date': ['2023-01-01', '2023-01-02','2023-01-03','2023-01-04','2023-01-05','2023-01-06','2023-01-07','2023-01-08','2023-01-09',],
       'High': [10,20,30,40,50,60,70,80,90],
       'Low': [1,2,3,4,5,6,7,8,9]}
    
df2 = {'Beg Date': ['2023-01-01', '2023-01-02','2023-01-03','2023-01-04','2023-01-05'],
       'RT_1 top date':['2023-01-01', '2023-01-02','2023-01-03','2023-01-04',np.nan],
       'RT_2 top date':['2023-01-02', np.nan, '2023-01-05', np.nan, np.nan],
       'RT_3 top date':['2023-01-03', np.nan, np.nan, '2023-01-05', np.nan],
       'RT_4 top date':['2023-01-04', np.nan, np.nan, '2023-01-02', np.nan],
       'RT_5 top date':['2023-01-05', np.nan, np.nan, np.nan, np.nan],
       'random': [3,4,9,10,10],
       'RT_1 bottom date':['2023-01-01', '2023-01-02','2023-01-03','2023-01-04','2023-01-05'],
       'RT_2 bottom date':['2023-01-02', np.nan, '2023-01-05', np.nan, np.nan],
       'RT_3 bottom date':['2023-01-03', np.nan, np.nan, '2023-01-05', np.nan],
       'RT_4 bottom date':['2023-01-04', np.nan, np.nan, '2023-01-02', np.nan],
       'RT_5 bottom date':['2023-01-05', np.nan, np.nan, np.nan, np.nan],}
    
df1 = pd.DataFrame(df1)
df2 = pd.DataFrame(df2)
    
df3 = {'RT_1_top_price': [10, 20, 30, 40, np.nan],
       'RT_2_top_price':[20, np.nan, 50, np.nan, np.nan],
       'RT_3_top_price':[30, np.nan, np.nan,50, np.nan],
       'RT_4_top_price':[40, np.nan, np.nan,20, np.nan],
       'RT_5_top_price':[50, np.nan, np.nan,np.nan, np.nan],
       'RT_1_bottom_price':[1,2,3,4,5],
       'RT_2_bottom_price':[2, np.nan, 5, np.nan, np.nan],
       'RT_3_bottom_price':[3, np.nan, np.nan,5, np.nan],
       'RT_4_bottom_price':[4, np.nan, np.nan,2, np.nan],
       'RT_5_bottom_price':[5, np.nan, np.nan,np.nan, np.nan],
                      }
    
df3 = pd.DataFrame(df3)
    
df4 = merge(df1, df2, df3).set_index(df1['Date']) # not perfect code just showing what I would like

In df4 I would like the “Beg Date” col to align with the same date in the index.

Been trying different versions of this:

df3 = pd.DataFrame()

    
for col in df2.filter(regex='top|bottom').columns:
    # create a new column name based on the first four characters of col and '_price'
    new_col = col[:4] + '_price'
        
    result = pd.merge(df1_copy, df2, left_on='Date', right_on=col, how='left')
        
    df3[new_col] = result['price']

If I get past this, I usually have merge issues due to NaT values.

One option is to flip the second dataframe into a long form before merging, that should be more performant and easier to work with:

I use pyjanitor pivot_longer for the wide to long transformation because of its simplicity and performance; you can achieve the same with melt/stack/wide_to_long:

# pip install pyjanitor
import janitor
import pandas as pd

flipped = (df2
          .pivot_longer(
              index = 'Beg Date', 
              column_names = "RT*",
              names_to = ("RT", ".value"), 
              names_pattern = r"(RT_\d)\s(top|bottom)\sdate", 
              dropna=True)
        )

top = (df1
       .merge(flipped.loc[:, ['Beg Date', 'RT', 'top']], 
              left_on = 'Date', 
              right_on = 'top')
       .pivot(index='Beg Date', columns="RT", values="High")
      )
bottom = (df1
         .merge(flipped.loc[:, ['Beg Date', 'RT', 'bottom']], 
                left_on = 'Date', 
                right_on = 'bottom')
         .pivot(index='Beg Date', columns="RT", values="Low")
        )

top.columns = [f"{column}_top_price" for column in top]
bottom.columns = [f"{column}_bottom_price" for column in bottom]

pd.concat([top, bottom], axis=1)

            RT_1_top_price  RT_2_top_price  RT_3_top_price  RT_4_top_price  RT_5_top_price  RT_1_bottom_price  RT_2_bottom_price  RT_3_bottom_price  RT_4_bottom_price  RT_5_bottom_price
Beg Date                                                                                                                                                                                 
2023-01-01            10.0            20.0            30.0            40.0            50.0                1.0                2.0                3.0                4.0                5.0
2023-01-02            20.0             NaN             NaN             NaN             NaN                2.0                NaN                NaN                NaN                NaN
2023-01-03            30.0            50.0             NaN             NaN             NaN                3.0                5.0                NaN                NaN                NaN
2023-01-04            40.0             NaN            50.0            20.0             NaN                4.0                NaN                5.0                2.0                NaN
2023-01-05             NaN             NaN             NaN             NaN             NaN                5.0                NaN                NaN                NaN                NaN

Another solution using stack and map:

stacked = df2.set_index("Beg Date").drop(columns=["random"]).stack()
output = (stacked.map(dict(zip(df1["Date"],df1["High"])))
          .where(stacked.index.get_level_values(1).str.contains("top date"),
                 stacked.map(dict(zip(df1["Date"],df1["Low"]))))
          .unstack()
          )
output = output.rename(columns={c: c.replace(" ", "_").replace("date","price") for c in output.columns})

>>> output 
            RT_1_top_price  RT_2_top_price  RT_3_top_price  RT_4_top_price  \
Beg Date                                                                     
2023-01-01            10.0            20.0            30.0            40.0   
2023-01-02            20.0             NaN             NaN             NaN   
2023-01-03            30.0            50.0             NaN             NaN   
2023-01-04            40.0             NaN            50.0            20.0   
2023-01-05             NaN             NaN             NaN             NaN   

            RT_5_top_price  RT_1_bottom_price  RT_2_bottom_price  \
Beg Date                                                           
2023-01-01            50.0                1.0                2.0   
2023-01-02             NaN                2.0                NaN   
2023-01-03             NaN                3.0                5.0   
2023-01-04             NaN                4.0                NaN   
2023-01-05             NaN                5.0                NaN   

            RT_3_bottom_price  RT_4_bottom_price  RT_5_bottom_price  
Beg Date                                                             
2023-01-01                3.0                4.0                5.0  
2023-01-02                NaN                NaN                NaN  
2023-01-03                NaN                NaN                NaN  
2023-01-04                5.0                2.0                NaN  
2023-01-05                NaN                NaN                NaN  

Leave a Comment