1

我想将嵌套的 JSON 转换为 CSV 格式,并把其中嵌套的列表/字典展开为分组的子行。

这是我的json

# Sample nested JSON document: scalar fields, one nested dict ("item") and a
# list of packages, each holding its own list of products.
data =\
{
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }
    ]
}
import pandas as pd

# With default arguments json_normalize expands the nested "item" dict into
# item.* columns but leaves the "packages" list as a single cell (see the
# displayed frame below), so the list elements do not become rows.
df = pd.json_normalize(data)

# display(df)
  description id       name                                                                                                                                                                                                packages item.description item.id
0         HLD  1  HIGHLEVEL  [{'id': '1', 'label': 'Package 1', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}, {'id': '2', 'label': 'Package 3', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]      description      11

JSON到CSV转换器的输出

"id","name","description","item__id","item__description","packages__id","packages__label","packages__products__id","packages__products__price"
"1","HIGHLEVEL","HLD","11","description","1","Package 1","1","5"
"","","","","","","","2","3"
"","","","","","2","Package 3","1","5"
"","","","","","","","2","3"

我尝试了 pandas 的 json_normalize,但结果与想要的不一样:JSON 数组没有被展开为 CSV 中的子行。另外,我希望像上面的输出那样,在 CSV 中保留空字符串。

我想用 Python 脚本做同样的事情。

4

2 回答 2

2

这应该适合你:

from copy import deepcopy
import pandas


def cross_join(left, right):
    """Return the Cartesian product of two lists of row dicts.

    Every row of ``left`` is combined with every row of ``right``. When both
    rows define the same key, the value from the ``right`` row wins.

    Args:
        left: List of dicts (the rows built so far).
        right: List of dicts to combine with each row of ``left``.

    Returns:
        A new list of ``len(left) * len(right)`` merged rows, each one a deep
        copy that shares no mutable state with the inputs. If ``right`` is
        empty there is nothing to combine with and ``left`` itself (the same
        list object) is returned.
    """
    if not right:
        return left

    new_rows = []
    for left_row in left:
        for right_row in right:
            # Merge shallowly first and deep-copy once; copying the left row
            # and then the merged row again did the same work twice.
            merged = dict(left_row)
            merged.update(right_row)
            new_rows.append(deepcopy(merged))
    return new_rows


def flatten_list(data):
    """Lazily yield the non-list items of ``data``, descending into nested lists.

    Items are produced depth-first in their original order. Only ``list``
    instances are expanded; any other item (dict, tuple, scalar, ...) is
    yielded as-is.
    """
    for item in data:
        if not isinstance(item, list):
            yield item
            continue
        # Nested list: recurse and forward its leaves in order.
        for leaf in flatten_list(item):
            yield leaf


def json_to_dataframe(data_in):
    """Flatten nested JSON-like data into a pandas DataFrame.

    Nested dict keys are joined with ``_`` to form column names, e.g.
    ``{"item": {"id": 1}}`` produces a column ``item_id``. Each element of a
    list contributes its own rows, and the rows produced by the different
    keys of a dict are combined with ``cross_join``, so parent values are
    repeated on every child row.

    An empty list contributes one row holding an empty string, so its column
    still appears in the result. The input is not modified.

    Args:
        data_in: A dict, list or scalar as produced by ``json.load``.

    Returns:
        pandas.DataFrame with one row per combination of leaf values.
    """
    def flatten_json(data, prev_heading=''):
        # Returns a flat list of row dicts for the subtree ``data``.
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, prev_heading + '_' + key))
        elif isinstance(data, list):
            # Treat an empty list as a single "" element. A substitute list
            # is used instead of appending to ``data``, which would silently
            # modify the caller's input.
            elements = data if data else [""]
            rows = []
            for elem in elements:
                rows.extend(flatten_list(flatten_json(elem, prev_heading)))
        else:
            # Leaf value: drop the leading '_' that the dict branch prepends.
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))

def remove_duplicates(df, columns=None):
    """Blank out repeated values in the selected columns of ``df``.

    Within each selected column, every occurrence of a value after its first
    one is replaced by an empty string. Repeats are detected over the whole
    column, not only between adjacent rows.

    Args:
        df: DataFrame to modify. It is changed in place.
        columns: Iterable of column labels to process. Defaults to the first
            seven columns of ``df``, which preserves the original hard-coded
            behaviour; pass the labels explicitly for other layouts.

    Returns:
        The same ``df`` object, for call chaining.
    """
    if columns is None:
        columns = list(df)[:7]
    for c in columns:
        df[c] = df[c].mask(df[c].duplicated(), "")

    return df


if __name__ == '__main__':
    # Flatten the sample document, blank the repeated parent values, then
    # show the result and write it out as CSV without the index column.
    df = remove_duplicates(json_to_dataframe(data))

    print(df)
    df.to_csv('data.csv', index=False)

输入 01:

# Sample input 01: two packages, each with three products.
data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }
    ]
}

输出 01:

在此处输入图像描述

输入 02:

# Sample input 02: both packages have an empty "products" list, which
# exercises the empty-list branch of flatten_json in json_to_dataframe.
data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": []
        }, {
            "id": "2",
            "label": "Package 3",
            "products": []
        }
    ]
}

输出 02: 在此处输入图像描述

希望它能解决您的问题。如果您需要任何解释,请告诉我。

谢谢

于 2021-08-23T21:39:30.020 回答
0

谢谢 @Trenton McKinney

import pandas as pd
import json

# Sample nested document: top-level scalars, a nested "item" dict and a list
# of packages, each with its own list of products.
data =\
{'description': 'HLD',
 'id': '1',
 'item': {'description': 'description', 'id': '11'},
 'name': 'HIGHLEVEL',
 'packages': [{'id': '1',
               'label': 'Package 1',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]},
              {'id': '2',
               'label': 'Package 3',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]}


# One row per package; the top-level fields are carried along as metadata
# with a temporary 'top_' prefix so they do not clash with the package fields.
df = pd.json_normalize(data, record_path=['packages'], meta=['id', 'name', 'description', ['item', 'id'], ['item', 'description']], meta_prefix='top_', sep='_')

# One row per product. explode() repeats the package's index label on every
# product row, so the index must be reset: the join below aligns on the index
# and would otherwise attach the wrong product to each row.
df = df.explode('products').reset_index(drop=True)

df.rename({'id': 'packages_id', 'label': 'packages_label'}, axis=1, inplace=True)

# Expand each product dict into its own columns (id, price), aligned by
# position with the exploded rows.
df = df.join(pd.DataFrame(df.pop('products').values.tolist()))

df.rename({'id': 'packages_products_id', 'price': 'packages_products_price'}, axis=1, inplace=True)

# Drop the temporary metadata prefix.
df.columns = df.columns.str.replace('top_', '')

# Put the columns in the desired output order; copy so the column
# assignments below operate on an independent frame.
df = df[['id', 'name', 'description', 'item_id', 'item_description', 'packages_id', 'packages_label', 'packages_products_id', 'packages_products_price']].copy()

columns_to_group = ["name", "description", "item_id", "item_description", "packages_id", "packages_label"]

# Show each parent value only on its first row and blank the repeats.
for c in columns_to_group:
  df[c] = df[c].mask(
    df[c].duplicated(), ""
  )

print(df)

df.to_csv('data.csv', index=False)

现在我必须使它更通用,以便它可以用于任何结构化的 json。

于 2021-08-23T10:32:22.750 回答