- 最简单的做法是先读取 csv,再把每行的额外信息(id 和 date)写入对应的 JSON 数据。
- JSON 中有很多不必要的嵌套层级,可以去掉,这样后续处理会更容易。
更新 JSON 数据
import csv
import json
import pandas as pd
from ast import literal_eval

# Read the semicolon-delimited csv file. newline='' lets the csv module do its
# own line-ending handling, as its documentation recommends.
with open('test.csv', 'r', newline='') as f:
    data = list(csv.reader(f, delimiter=';'))

# Flatten the JSON payload of every data row into one dict that also carries
# the row's id and date; new_json ends up as a list of those dicts.
new_json = []
for row in data[1:]:  # data[0] is the header row
    if not row:  # csv.reader yields an empty list for blank lines
        continue
    idx, date, raw = row
    try:
        # The column holds JSON text, so parse it as JSON; this also accepts
        # true / false / null, which literal_eval rejects.
        record = json.loads(raw)
    except json.JSONDecodeError:
        # Fallback kept for payloads written as Python literals
        # (e.g. single-quoted keys), which the earlier version accepted.
        record = literal_eval(raw)
    person = record.pop('Person')  # the wrapper is not needed once flattened
    record['id'] = idx  # add unique id
    record['date'] = date  # add report date
    record['Accounts'] = person['Accounts']['Account']  # move list to top level key
    record['Alerts'] = person['Alerts']['Alert']  # move list to top level key
    record['first_name'] = person['Name']['FirstName']  # move value to top level key
    record['last_name'] = person['Name']['LastName']  # move value to top level key
    new_json.append(record)
# print(new_json[0])
{'Accounts': [{'AccountNumber': 123, 'AccountStatus': 'G'},
{'AccountNumber': 137, 'AccountStatus': 'B'},
{'AccountNumber': 593, 'AccountStatus': 'VB'}],
'Alerts': [{'DT': '20200601', 'Msg': 'Lorem ipsum'},
{'DT': '20200615', 'Msg': 'Dolor sit amet', 'Msg2': 'Lorem'}],
'date': '20200601',
'first_name': 'John1',
'id': '123',
'last_name': 'Doe1'}
创建单独的数据框
# Accounts table: one row per entry of each record's 'Accounts' list, with the
# record's id and date repeated on every row.
accounts = pd.json_normalize(
    new_json,
    record_path=['Accounts'],
    meta=['id', 'date'],
)
# display(accounts.head())
AccountNumber AccountStatus id date
0 123 G 123 20200601
1 137 B 123 20200601
2 593 VB 123 20200601
3 123 G 456 20200602
4 137 B 456 20200602
# Alerts table: one row per entry of each record's 'Alerts' list, with the
# record's id and date repeated on every row.
alerts = pd.json_normalize(
    new_json,
    record_path=['Alerts'],
    meta=['id', 'date'],
)
# display(alerts.head())
DT Msg Msg2 id date
0 20200601 Lorem ipsum NaN 123 20200601
1 20200615 Dolor sit amet Lorem 123 20200601
2 20200601 Lorem ipsum NaN 456 20200602
3 20200615 Dolor sit amet Lorem 456 20200602
4 20200601 Lorem ipsum NaN 789 20200603
# Name table: one row per record, keeping everything except the two list
# columns that were split out into their own tables.
name = pd.json_normalize(new_json).drop(['Accounts', 'Alerts'], axis='columns')
# display(name)
id date first_name last_name
0 123 20200601 John1 Doe1
1 456 20200602 John2 Doe2
2 789 20200603 John3 Doe3
3 123 20200606 John1 Doe1
使用的数据 `test.csv`:
id;date;json
123;20200601;{"Person": {"Name": {"FirstName": "John1", "LastName": "Doe1"}, "Accounts": {"Account": [{"AccountNumber":123, "AccountStatus": "G"}, {"AccountNumber":137, "AccountStatus": "B"}, {"AccountNumber":593, "AccountStatus": "VB"}]}, "Alerts": {"Alert": [{"DT":"20200601", "Msg": "Lorem ipsum"}, {"DT":"20200615", "Msg": "Dolor sit amet", "Msg2": "Lorem"}]}}}
456;20200602;{"Person": {"Name": {"FirstName": "John2", "LastName": "Doe2"}, "Accounts": {"Account": [{"AccountNumber":123, "AccountStatus": "G"}, {"AccountNumber":137, "AccountStatus": "B"}, {"AccountNumber":593, "AccountStatus": "VB"}]}, "Alerts": {"Alert": [{"DT":"20200601", "Msg": "Lorem ipsum"}, {"DT":"20200615", "Msg": "Dolor sit amet", "Msg2": "Lorem"}]}}}
789;20200603;{"Person": {"Name": {"FirstName": "John3", "LastName": "Doe3"}, "Accounts": {"Account": [{"AccountNumber":123, "AccountStatus": "G"}, {"AccountNumber":137, "AccountStatus": "B"}, {"AccountNumber":593, "AccountStatus": "VB"}]}, "Alerts": {"Alert": [{"DT":"20200601", "Msg": "Lorem ipsum"}, {"DT":"20200615", "Msg": "Dolor sit amet", "Msg2": "Lorem"}]}}}
123;20200606;{"Person": {"Name": {"FirstName": "John1", "LastName": "Doe1"}, "Accounts": {"Account": [{"AccountNumber":123, "AccountStatus": "G"}, {"AccountNumber":137, "AccountStatus": "B"}, {"AccountNumber":593, "AccountStatus": "VB"}]}, "Alerts": {"Alert": [{"DT":"20200601", "Msg": "Lorem ipsum"}, {"DT":"20200615", "Msg": "Dolor sit amet", "Msg2": "Lorem"}]}}}
作为一个函数
from typing import List, Tuple # used for type hints
import csv
import pandas as pd
from ast import literal_eval
def fix_json(data: List[List[str]]) -> List[dict]:
    """Flatten the JSON payload of every csv row into a single dict.

    Args:
        data: Rows as produced by ``csv.reader``. The first row is treated
            as the header and skipped; every other non-empty row must be
            ``[id, date, json_text]``.

    Returns:
        One dict per data row. The ``'Person'`` wrapper is removed and its
        contents are exposed as the top-level keys ``'Accounts'``,
        ``'Alerts'``, ``'first_name'`` and ``'last_name'``, next to the
        row's ``'id'`` and ``'date'``.

    Raises:
        KeyError: If a payload lacks one of the expected nested keys.
        ValueError: If a non-empty row does not have exactly three fields,
            or a payload is neither valid JSON nor a Python literal.
    """
    # Local import so the function does not depend on a file-level import
    # that may be absent.
    import json

    rows = iter(data)
    next(rows, None)  # skip the header row (no-op on empty input)

    new_json = []
    for row in rows:
        if not row:  # csv.reader yields an empty list for blank lines
            continue
        idx, date, raw = row
        try:
            # Parse as JSON so that true / false / null are accepted;
            # literal_eval rejects those.
            record = json.loads(raw)
        except json.JSONDecodeError:
            # Fallback kept for payloads written as Python literals
            # (e.g. single-quoted keys), which the earlier version accepted.
            record = literal_eval(raw)
        person = record.pop('Person')  # the wrapper is not needed once flattened
        record['id'] = idx
        record['date'] = date
        record['Accounts'] = person['Accounts']['Account']
        record['Alerts'] = person['Alerts']['Alert']
        record['first_name'] = person['Name']['FirstName']
        record['last_name'] = person['Name']['LastName']
        new_json.append(record)
    return new_json
def make_dataframes(
    file_path_name: str,
    delimiter: str = ';',
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the csv file and split its JSON column into three DataFrames.

    Args:
        file_path_name: Path of the csv file to read.
        delimiter: Field separator used in the csv file.

    Returns:
        ``(accounts, alerts, names)``: ``accounts`` and ``alerts`` are built
        from each record's ``'Accounts'`` / ``'Alerts'`` list with ``id``
        and ``date`` attached as metadata; ``names`` holds the remaining
        per-record columns.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends.
    with open(file_path_name, 'r', newline='') as f:
        data = list(csv.reader(f, delimiter=delimiter))

    new_json = fix_json(data)

    accounts = pd.json_normalize(new_json, ['Accounts'], ['id', 'date'])
    alerts = pd.json_normalize(new_json, ['Alerts'], ['id', 'date'])
    # errors='ignore' keeps a file with no data rows from raising KeyError
    # when the two list columns are absent.
    names = pd.json_normalize(new_json).drop(
        columns=['Accounts', 'Alerts'], errors='ignore'
    )
    return accounts, alerts, names
# Example call: build the accounts, alerts and names DataFrames from test.csv.
accounts, alerts, names = make_dataframes('test.csv')