I was trying to convert CSV to JSON Objects and I need to decode the nested objects. For example.
"row_id",_emitted_at,_data
rid_1,124,"{""property_id"":""xx"",""date"":""20220221"",""active1DayUsers"":3}"
rid_2,124,"{""property_id"":""xx"",""date"":""20220222"",""active1DayUsers"":2}"
So what I need in
{
row_id: rid_1,
_emitted_at :124,
property_id : xx,
date: 20220221,
active1DayUsers: 3
}
Do I have any third-party library to flatten the nested objects in CSV in Python?
My current approach is like read first and second line and extract the keys.
Thanks
Your data is not exactly in CSV format, nor does it is in JSON format. This make it a little hard to parse. My approach is to parse the lines using regular expression.
import re
import json
PATTERN = re.compile(r'^(.*?),(.*?)"(.*)"$')
def parse(text):
text = text.strip()
if not text:
return None
matched = PATTERN.match(text)
if not matched:
return None
row_id, emitted, data = matched.groups()
data = data.replace('""', '"') # "" becomes "
data = json.loads(data)
data['row_id'] = row_id
data['_emitted'] = int(emitted)
return data
lines = """
"row_id",_emitted_at,_data
rid_1,124"{""property_id"":""xx"",""date"":""20220221"",""active1DayUsers"":3}"
rid_2,124"{""property_id"":""xx"",""date"":""20220222"",""active1DayUsers"":2}"
"""
lines = iter(lines.strip().splitlines())
next(lines) # Discard the headers
out = [
parsed
for line in lines
if (parsed := parse(line)) is not None
]
print(json.dumps(out, indent=4))
Output
[
{
"property_id": "xx",
"date": "20220221",
"active1DayUsers": 3,
"row_id": "rid_1",
"_emitted": 124
},
{
"property_id": "xx",
"date": "20220222",
"active1DayUsers": 2,
"row_id": "rid_2",
"_emitted": 124
}
]
Let is assume you save the example in a file as:
row_id,_emitted_at,_data
rid_1,124,"{""property_id"":""xx"",""date"":""20220221"",""active1DayUsers"":3}"
rid_2,124,"{""property_id"":""xx"",""date"":""20220222"",""active1DayUsers"":2}"
Then this code does what you need:
import pandas as pd
# Read the CSV file
df = pd.read_csv('your_file.csv')
# Iterate over each row in the DataFrame
for index, row in df.iterrows():
# Create a new dictionary to store the flattened row
flattened_row = {}
# Extract the value of 'row_id' and '_emitted_at' columns
flattened_row['row_id'] = row['row_id']
flattened_row['_emitted_at'] = row['_emitted_at']
# Decode the nested JSON object
nested_object = eval(row['_data'])
# Extract the keys and values from the nested object
for key, value in nested_object.items():
flattened_row[key] = value
# Print the flattened row
print(flattened_row)
Output
{'row_id': 'rid_1', '_emitted_at': 124, 'property_id': 'xx', 'date': '20220221', 'active1DayUsers': 3}
{'row_id': 'rid_2', '_emitted_at': 124, 'property_id': 'xx', 'date': '20220222', 'active1DayUsers': 2}
Try this
from io import StringIO
import json
s="""
"row_id",_emitted_at,_data
rid_1,124,"{""property_id"":""xx"",""date"":""20220221"",""active1DayUsers"":3}"
rid_2,124,"{""property_id"":""xx"",""date"":""20220222"",""active1DayUsers"":2}"
"""
f = StringIO(s)
def rf(f, skip=1): # number of lines to skip from file
r = []
for n, line in enumerate(f):
if n < skip: continue
if n == skip: # header line
line = line.replace('"','')
k0, k1, k2 = line.split(',')
continue
i = line.find('"')
if i < 0: continue
v0, v1 = line[:i-1].split(',')
line = line[i:].replace('""','"')
v2 = json.loads(line[line.find('{'):line.rfind('}')+1])
d = {k0:v0, k1:v1} # if you care about key order
d.update(v2)
r.append(d)
return r
Yes, you can use the pandas library in Python to read the CSV file and then flatten the nested objects into a single JSON object. Here’s an example of how you can do this:
import pandas as pd
import json
# Read the CSV file
df = pd.read_csv('your_csv_file.csv')
# Initialize an empty list to store the flattened objects
flattened_objects = []
# Iterate over each row in the DataFrame
for index, row in df.iterrows():
# Parse the nested JSON object in the _data column
nested_object = json.loads(row['_data'])
# Flatten the nested object
flattened_object = {
'row_id': row['row_id'],
'_emitted_at': row['_emitted_at'],
'property_id': nested_object['property_id'],
'date': nested_object['date'],
'active1DayUsers': nested_object['active1DayUsers']
}
# Append the flattened object to the list
flattened_objects.append(flattened_object)
# Convert the list of flattened objects to a DataFrame
flattened_df = pd.DataFrame(flattened_objects)
# Convert the DataFrame to JSON
flattened_json = flattened_df.to_json(orient="records", lines=True)
print(flattened_json)
Are you missing commas after the 124s?