- 将半结构化 JSON 数据规范化为平面表。
- `data` 是由各行 JSON 字典组成的列表。
- Pandas:索引和选择数据
- 数据:带有地理信息的推文(英文)(选择 1)
- 每个文件都包含多行字典。
- 它们不在列表或元组内,因此读取每一行。
- `tweet_locations` 的值是一个字典列表。
- `user_location` 的值是一个字典。
- 当 `tweet_locations` 是空列表 `[]` 而不是 `[{}]` 时,由于 `json_normalize` 期望查看 `metadata` 字段的方式,该行不会包含在数据框中。
- 来自以下记录的 `tweet_id` 不会包含在数据中:
`{"tweet_id":"1256223765513584641","created_at":"Fri May 01 14:07:39 +0000 2020","user_id":"772487185031311360","geo_source":"user_location","user_location":{"country_code":"us"},"geo":{},"place":{},"tweet_locations":[]}`
- 这可以通过在 `"tweet_locations":[]` 为 `True` 时设置 `"tweet_locations" = [{}]` 来解决。
import pandas as pd
import json
from pathlib import Path
# path to the file that contains the sample data shown at the bottom of this answer
file = Path('data/test.json')  # some path to your file

# load file: every line is a separate JSON dict (the dicts are not wrapped in
# a list), so the file is decoded one line at a time; blank lines are skipped
# because json.loads cannot decode them
with file.open('r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# create dataframe: one row per entry of 'tweet_locations', with 'tweet_id' and
# 'user_location.country_code' carried along as metadata columns.
# errors='ignore' leaves NaN where a metadata key is missing.
# NOTE: a tweet whose 'tweet_locations' is an empty list contributes no rows.
df = pd.json_normalize(data, 'tweet_locations', ['tweet_id', ['user_location', 'country_code']], errors='ignore')
# display(df.head())
country_code state county city tweet_id user_location.country_code
0 us Illinois McLean County Normal 1256223753220034566 NaN
1 ke Kiambu County NaN NaN 1256223748904161280 ca
2 us Illinois McLean County Normal 1256223744122593287 us
3 th Saraburi Province NaN NaN 1256223753463365632 NaN
4 in Assam Lanka NaN 1256223753463365632 NaN
# keep the rows where either the tweet location or the user location is 'us'
us = df.loc[df['country_code'].eq('us') | df['user_location.country_code'].eq('us')]
# display(us)
country_code state county city tweet_id user_location.country_code
0 us Illinois McLean County Normal 1256223753220034566 NaN
2 us Illinois McLean County Normal 1256223744122593287 us
15 us Michigan Sanilac County NaN 1256338355106672640 in
16 us West Virginia Clay County NaN 1256338355106672640 in
18 us Florida Taylor County NaN 1256338355106672640 in
# get the unique tweet_id values of the US rows; this must read from the
# filtered frame `us`, because `df` still holds the tweets of every country
df_tweet_ids = us.tweet_id.unique().tolist()
print(df_tweet_ids)
['1256223753220034566', '1256223744122593287', '1256338355106672640']
加载和解析所有 JSON 文件
# path to files
p = Path('c:/path_to_files')

# get all json files, searching sub-directories recursively
files = list(p.rglob('*.json'))

# parse files, keeping only the US rows of each one
us_data = []
for file in files:
    # every line is a separate JSON dict, so decode the file line by line;
    # blank lines are skipped because json.loads cannot decode them
    with file.open('r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    # create dataframe: one row per entry of 'tweet_locations', with 'tweet_id'
    # and 'user_location.country_code' carried along as metadata columns
    df = pd.json_normalize(data, 'tweet_locations', ['tweet_id', ['user_location', 'country_code']], errors='ignore')

    # filter for US in the two columns
    df = df[(df.country_code == 'us') | (df['user_location.country_code'] == 'us')]
    us_data.append(df)

    # release the per-file objects before the next file is loaded, so only the
    # filtered frames stay in memory
    del data, df

# combine all data into one dataframe
# NOTE: pd.concat raises ValueError when no files were found (empty list)
us = pd.concat(us_data)

# delete the list that is no longer needed
del us_data
只解析tweet_id
没有 pandas
- 因为文件是由多行字典组成的,所以不需要 `ijson`。
- 如所写,只要 `user_location` 的 `country_code` 是 `'us'`,就会包含该 `tweet_id`,即使 `tweet_locations` 是一个空列表。
- 来自以下记录的 `tweet_id` 将包含在数据中:
`{"tweet_id":"1256223765513584641","created_at":"Fri May 01 14:07:39 +0000 2020","user_id":"772487185031311360","geo_source":"user_location","user_location":{"country_code":"us"},"geo":{},"place":{},"tweet_locations":[]}`
file = Path('data/en_geo_2020-05-01/en_geo_2020-05-01.json')

# collect the tweet_id of every tweet whose user location or any tweet location is 'us'
tweet_ids = []
with file.open('r', encoding='utf-8') as f:
    for line in f:
        # blank lines cannot be decoded by json.loads
        if not line.strip():
            continue
        tweet = json.loads(line)

        # `or {}` / `or []` keep a missing or null field from raising
        user_is_us = (tweet.get('user_location') or {}).get('country_code') == 'us'

        # a user located in the US is enough, even when tweet_locations is empty;
        # otherwise look for 'us' among the locations mentioned in the tweet
        if user_is_us or any(
            loc.get('country_code') == 'us' for loc in tweet.get('tweet_locations') or []
        ):
            tweet_ids.append(tweet.get('tweet_id'))

print(tweet_ids)
['1256223753220034566', '1256223744122593287', '1256338355106672640']
样本数据
{"tweet_id":"1256223753220034566","created_at":"Fri May 01 14:07:36 +0000 2020","user_id":"916540973190078465","geo_source":"tweet_text","user_location":{},"geo":{},"place":{},"tweet_locations":[{"country_code":"us","state":"Illinois","county":"McLean County","city":"Normal"}]}
{"tweet_id":"1256223748904161280","created_at":"Fri May 01 14:07:35 +0000 2020","user_id":"697426379583983616","geo_source":"user_location","user_location":{"country_code":"ca"},"geo":{},"place":{},"tweet_locations":[{"country_code":"ke","state":"Kiambu County"}]}
{"tweet_id":"1256223744122593287","created_at":"Fri May 01 14:07:34 +0000 2020","user_id":"1277481013","geo_source":"user_location","user_location":{"country_code":"us","state":"Florida"},"geo":{},"place":{},"tweet_locations":[{"country_code":"us","state":"Illinois","county":"McLean County","city":"Normal"}]}
{"tweet_id":"1256223753463365632","created_at":"Fri May 01 14:07:36 +0000 2020","user_id":"596005899","geo_source":"tweet_text","user_location":{},"geo":{},"place":{},"tweet_locations":[{"country_code":"th","state":"Saraburi Province"},{"country_code":"in","state":"Assam","county":"Lanka"},{"country_code":"cz","state":"Northeast","county":"okres \u00dast\u00ed nad Orlic\u00ed"},{"country_code":"lk"}]}
{"tweet_id":"1256223753115238406","created_at":"Fri May 01 14:07:36 +0000 2020","user_id":"139159502","geo_source":"user_location","user_location":{"country_code":"ca"},"geo":{},"place":{},"tweet_locations":[{"country_code":"ve"},{"country_code":"ca","state":"Nova Scotia","county":"Pictou County","city":"Diamond"},{"country_code":"my","state":"Selangor","city":"Kajang"}]}
{"tweet_id":"1256223748161757190","created_at":"Fri May 01 14:07:35 +0000 2020","user_id":"1655021437","geo_source":"user_location","user_location":{"country_code":"af","state":"Nangarhar","county":"Kot"},"geo":{},"place":{},"tweet_locations":[{"country_code":"cz","state":"Northeast","county":"okres \u00dast\u00ed nad Orlic\u00ed"},{"country_code":"cz","state":"Northeast","county":"okres \u00dast\u00ed nad Orlic\u00ed"},{"country_code":"gb","state":"England","county":"Gloucestershire"}]}
{"tweet_id":"1256223749214437380","created_at":"Fri May 01 14:07:35 +0000 2020","user_id":"3244990814","geo_source":"user_location","user_location":{"country_code":"se"},"geo":{},"place":{},"tweet_locations":[{"country_code":"cg","state":"Kouilou","county":"Pointe-Noire"},{"country_code":"cn"}]}
{"tweet_id":"1256338355106672640","created_at":"Fri May 01 21:43:00 +0000 2020","user_id":"1205700416123486208","geo_source":"user_location","user_location":{"country_code":"in","state":"Delhi"},"geo":{},"place":{},"tweet_locations":[{"country_code":"us","state":"Michigan","county":"Sanilac County"},{"country_code":"us","state":"West Virginia","county":"Clay County"},{"country_code":"de","state":"Baden-W\u00fcrttemberg","county":"Verwaltungsgemeinschaft Friedrichshafen"},{"country_code":"us","state":"Florida","county":"Taylor County"}]}
{"tweet_id":"1256223764980944904","created_at":"Fri May 01 14:07:39 +0000 2020","user_id":"1124447266205503488","geo_source":"none","user_location":{},"geo":{},"place":{},"tweet_locations":[]}
{"tweet_id":"1256223760765595650","created_at":"Fri May 01 14:07:38 +0000 2020","user_id":"909477905737990144","geo_source":"tweet_text","user_location":{},"geo":{},"place":{},"tweet_locations":[{"country_code":"lr","state":"Grand Bassa County","county":"District # 2"}]}