How would I get from df1 and df2 to df3 and then merge them to create df4.
df1 = {'Date': ['2023-01-01', '2023-01-02','2023-01-03','2023-01-04','2023-01-05','2023-01-06','2023-01-07','2023-01-08','2023-01-09',],
'High': [10,20,30,40,50,60,70,80,90],
'Low': [1,2,3,4,5,6,7,8,9]}
df2 = {'Beg Date': ['2023-01-01', '2023-01-02','2023-01-03','2023-01-04','2023-01-05'],
'RT_1 top date':['2023-01-01', '2023-01-02','2023-01-03','2023-01-04',np.nan],
'RT_2 top date':['2023-01-02', np.nan, '2023-01-05', np.nan, np.nan],
'RT_3 top date':['2023-01-03', np.nan, np.nan, '2023-01-05', np.nan],
'RT_4 top date':['2023-01-04', np.nan, np.nan, '2023-01-02', np.nan],
'RT_5 top date':['2023-01-05', np.nan, np.nan, np.nan, np.nan],
'random': [3,4,9,10,10],
'RT_1 bottom date':['2023-01-01', '2023-01-02','2023-01-03','2023-01-04','2023-01-05'],
'RT_2 bottom date':['2023-01-02', np.nan, '2023-01-05', np.nan, np.nan],
'RT_3 bottom date':['2023-01-03', np.nan, np.nan, '2023-01-05', np.nan],
'RT_4 bottom date':['2023-01-04', np.nan, np.nan, '2023-01-02', np.nan],
'RT_5 bottom date':['2023-01-05', np.nan, np.nan, np.nan, np.nan],}
df1 = pd.DataFrame(df1)
df2 = pd.DataFrame(df2)
df3 = {'RT_1_top_price': [10, 20, 30, 40, np.nan],
'RT_2_top_price':[20, np.nan, 50, np.nan, np.nan],
'RT_3_top_price':[30, np.nan, np.nan,50, np.nan],
'RT_4_top_price':[40, np.nan, np.nan,20, np.nan],
'RT_5_top_price':[50, np.nan, np.nan,np.nan, np.nan],
'RT_1_bottom_price':[1,2,3,4,5],
'RT_2_bottom_price':[2, np.nan, 5, np.nan, np.nan],
'RT_3_bottom_price':[3, np.nan, np.nan,5, np.nan],
'RT_4_bottom_price':[4, np.nan, np.nan,2, np.nan],
'RT_5_bottom_price':[5, np.nan, np.nan,np.nan, np.nan],
}
df3 = pd.DataFrame(df3)
df4 = merge(df1, df2, df3).set_index(df1['Date']) # not perfect code just showing what I would like
In df4 I would like the “Beg Date” col to align with the same date in the index.
Been trying different versions of this:
df3 = pd.DataFrame()
for col in df2.filter(regex='top|bottom').columns:
# create a new column name based on the first four characters of col and '_price'
new_col = col[:4] + '_price'
result = pd.merge(df1_copy, df2, left_on='Date', right_on=col, how='left')
df3[new_col] = result['price']
If I get past this, I usually have merge issues due to NaT values.
One option is to flip the second dataframe into a long form before merging, that should be more performant and easier to work with:
I use pyjanitor pivot_longer for the wide to long transformation because of its simplicity and performance; you can achieve the same with melt/stack/wide_to_long:
# pip install pyjanitor
import janitor
import pandas as pd
flipped = (df2
.pivot_longer(
index = 'Beg Date',
column_names = "RT*",
names_to = ("RT", ".value"),
names_pattern = r"(RT_\d)\s(top|bottom)\sdate",
dropna=True)
)
top = (df1
.merge(flipped.loc[:, ['Beg Date', 'RT', 'top']],
left_on = 'Date',
right_on = 'top')
.pivot(index='Beg Date', columns="RT", values="High")
)
bottom = (df1
.merge(flipped.loc[:, ['Beg Date', 'RT', 'bottom']],
left_on = 'Date',
right_on = 'bottom')
.pivot(index='Beg Date', columns="RT", values="Low")
)
top.columns = [f"{column}_top_price" for column in top]
bottom.columns = [f"{column}_bottom_price" for column in bottom]
pd.concat([top, bottom], axis=1)
RT_1_top_price RT_2_top_price RT_3_top_price RT_4_top_price RT_5_top_price RT_1_bottom_price RT_2_bottom_price RT_3_bottom_price RT_4_bottom_price RT_5_bottom_price
Beg Date
2023-01-01 10.0 20.0 30.0 40.0 50.0 1.0 2.0 3.0 4.0 5.0
2023-01-02 20.0 NaN NaN NaN NaN 2.0 NaN NaN NaN NaN
2023-01-03 30.0 50.0 NaN NaN NaN 3.0 5.0 NaN NaN NaN
2023-01-04 40.0 NaN 50.0 20.0 NaN 4.0 NaN 5.0 2.0 NaN
2023-01-05 NaN NaN NaN NaN NaN 5.0 NaN NaN NaN NaN
Another solution using stack
and map
:
stacked = df2.set_index("Beg Date").drop(columns=["random"]).stack()
output = (stacked.map(dict(zip(df1["Date"],df1["High"])))
.where(stacked.index.get_level_values(1).str.contains("top date"),
stacked.map(dict(zip(df1["Date"],df1["Low"]))))
.unstack()
)
output = output.rename(columns={c: c.replace(" ", "_").replace("date","price") for c in output.columns})
>>> output
RT_1_top_price RT_2_top_price RT_3_top_price RT_4_top_price \
Beg Date
2023-01-01 10.0 20.0 30.0 40.0
2023-01-02 20.0 NaN NaN NaN
2023-01-03 30.0 50.0 NaN NaN
2023-01-04 40.0 NaN 50.0 20.0
2023-01-05 NaN NaN NaN NaN
RT_5_top_price RT_1_bottom_price RT_2_bottom_price \
Beg Date
2023-01-01 50.0 1.0 2.0
2023-01-02 NaN 2.0 NaN
2023-01-03 NaN 3.0 5.0
2023-01-04 NaN 4.0 NaN
2023-01-05 NaN 5.0 NaN
RT_3_bottom_price RT_4_bottom_price RT_5_bottom_price
Beg Date
2023-01-01 3.0 4.0 5.0
2023-01-02 NaN NaN NaN
2023-01-03 NaN NaN NaN
2023-01-04 5.0 2.0 NaN
2023-01-05 NaN NaN NaN