Formating CSV to JSON Python (Nested Objects)

Question 1

I was trying to convert CSV to JSON Objects and I need to decode the nested objects. For example.

"row_id",_emitted_at,_data

rid_1,124,"{""property_id"":""xx"",""date"":""20220221"",""active1DayUsers"":3}"

rid_2,124,"{""property_id"":""xx"",""date"":""20220222"",""active1DayUsers"":2}"

So what I need in

{
 row_id: rid_1,
 _emitted_at :124,
 property_id : xx,
 date: 20220221,
 active1DayUsers: 3
}

Do I have any third-party library to flatten the nested objects in CSV in Python?

My current approach is like read first and second line and extract the keys.

Thanks

Question 2

Your data is not exactly in CSV format, nor does it is in JSON format. This make it a little hard to parse. My approach is to parse the lines using regular expression.

import re
import json


PATTERN = re.compile(r'^(.*?),(.*?)"(.*)"$')

def parse(text):
    text = text.strip()
    if not text:
        return None
        
    matched = PATTERN.match(text)
    if not matched:
        return None
        
    row_id, emitted, data = matched.groups()
    data = data.replace('""', '"') # "" becomes "
    data = json.loads(data)
    data['row_id'] = row_id
    data['_emitted'] = int(emitted)
    return data
    
lines = """
"row_id",_emitted_at,_data

rid_1,124"{""property_id"":""xx"",""date"":""20220221"",""active1DayUsers"":3}"

rid_2,124"{""property_id"":""xx"",""date"":""20220222"",""active1DayUsers"":2}"
"""


lines = iter(lines.strip().splitlines())
next(lines)  # Discard the headers

out = [
    parsed
    for line in lines
    if (parsed := parse(line)) is not None
]
print(json.dumps(out, indent=4))

Output

[
    {
        "property_id": "xx",
        "date": "20220221",
        "active1DayUsers": 3,
        "row_id": "rid_1",
        "_emitted": 124
    },
    {
        "property_id": "xx",
        "date": "20220222",
        "active1DayUsers": 2,
        "row_id": "rid_2",
        "_emitted": 124
    }
]

Question 3

Let is assume you save the example in a file as:

row_id,_emitted_at,_data
rid_1,124,"{""property_id"":""xx"",""date"":""20220221"",""active1DayUsers"":3}"
rid_2,124,"{""property_id"":""xx"",""date"":""20220222"",""active1DayUsers"":2}"

Then this code does what you need:

import pandas as pd

# Read the CSV file
df = pd.read_csv('your_file.csv')

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    # Create a new dictionary to store the flattened row
    flattened_row = {}

    # Extract the value of 'row_id' and '_emitted_at' columns
    flattened_row['row_id'] = row['row_id']
    flattened_row['_emitted_at'] = row['_emitted_at']

    # Decode the nested JSON object
    nested_object = eval(row['_data'])

    # Extract the keys and values from the nested object
    for key, value in nested_object.items():
        flattened_row[key] = value

    # Print the flattened row
    print(flattened_row)

Output

{'row_id': 'rid_1', '_emitted_at': 124, 'property_id': 'xx', 'date': '20220221', 'active1DayUsers': 3}
{'row_id': 'rid_2', '_emitted_at': 124, 'property_id': 'xx', 'date': '20220222', 'active1DayUsers': 2}

Question 4

Try this

from io import StringIO
import json
s="""
"row_id",_emitted_at,_data

rid_1,124,"{""property_id"":""xx"",""date"":""20220221"",""active1DayUsers"":3}"

rid_2,124,"{""property_id"":""xx"",""date"":""20220222"",""active1DayUsers"":2}"
"""

f = StringIO(s)

def rf(f, skip=1):   # number of lines to skip from file 
    r = []
    for n, line in enumerate(f):
        if n < skip: continue
        if n == skip: # header line
            line = line.replace('"','')
            k0, k1, k2 = line.split(',')
            continue
        i = line.find('"')
        if i < 0: continue
        v0, v1 = line[:i-1].split(',')
        line = line[i:].replace('""','"')
        v2 = json.loads(line[line.find('{'):line.rfind('}')+1])
        d = {k0:v0, k1:v1} # if you care about key order
        d.update(v2)
        r.append(d)
    return r

Question 5

Yes, you can use the pandas library in Python to read the CSV file and then flatten the nested objects into a single JSON object. Here’s an example of how you can do this:

import pandas as pd
import json

# Read the CSV file
df = pd.read_csv('your_csv_file.csv')

# Initialize an empty list to store the flattened objects
flattened_objects = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    # Parse the nested JSON object in the _data column
    nested_object = json.loads(row['_data'])

    # Flatten the nested object
    flattened_object = {
        'row_id': row['row_id'],
        '_emitted_at': row['_emitted_at'],
        'property_id': nested_object['property_id'],
        'date': nested_object['date'],
        'active1DayUsers': nested_object['active1DayUsers']
    }

    # Append the flattened object to the list
    flattened_objects.append(flattened_object)

# Convert the list of flattened objects to a DataFrame
flattened_df = pd.DataFrame(flattened_objects)

# Convert the DataFrame to JSON
flattened_json = flattened_df.to_json(orient="records", lines=True)

print(flattened_json)

Leave a Comment Cancel reply