使用pd.json_normalize拼合JSON文件

txu3uszq  于 5个月前  发布在  其他
关注(0)|答案(1)|浏览(56)

我正在尝试将JSON文件解析为表,并且遇到了一些嵌套列表的问题。
该文件是以下内容的重复列表:

[{
  "uuid": "a2d89c9b-6e2e-4e3a-8d60-bf3ce2fe3fda",
  "timestamp": "2023-11-23 00:26:31.851000 UTC",
  "process_timestamp": "2023-11-23 00:26:32.326000 UTC",
  "visitor_id": "oeu1700282566730r0.9025758502018271",
  "session_id": "AUTO",
  "account_id": "25408250069",
  "experiments": {
    "list": [{
      "element": {
        "campaign_id": "26314710187",
        "experiment_id": "26322360336",
        "variation_id": "26314800349",
        "is_holdback": "false"
      }
    }]
  },
  "entity_id": "25754820685",
  "attributes": {
    "list": [{
      "element": {
        "id": null,
        "name": "",
        "type": "browserId",
        "value": "gc"
      }
    }, {
      "element": {
        "id": null,
        "name": "",
        "type": "campaign",
        "value": "blablabla"
      }
    }, {
      "element": {
        "id": null,
        "name": "",
        "type": "device",
        "value": "desktop"
      }
    }, {
      "element": {
        "id": null,
        "name": "",
        "type": "device_type",
        "value": "desktop_laptop"
      }
    }, {
      "element": {
        "id": null,
        "name": "",
        "type": "referrer",
        "value": "https://bookings.perrito.com/21df6542"
      }
    }, {
      "element": {
        "id": null,
        "name": "",
        "type": "source_type",
        "value": "campaign"
      }
    }, {
      "element": {
        "id": null,
        "name": "",
        "type": "currentTimestamp",
        "value": "1700699073915"
      }
    }, {
      "element": {
        "id": null,
        "name": "",
        "type": "offset",
        "value": "300"
      }
    }]
  },
  "user_ip": "72.38.10.0",
  "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
  "referer": "https://bookings.perrito.com/",
  "event_type": "other",
  "event_name": "transaction",
  "revenue": "240939",
  "value": null,
  "quantity": null,
  "tags": {
    "key_value": [{
      "key": "tour_id",
      "value": "386"
    }, {
      "key": "booking_id",
      "value": "123456"
    }, {
      "key": "payment_type",
      "value": "creditcard"
    }, {
      "key": "revenue",
      "value": "240939"
    }, {
      "key": "pax",
      "value": "1"
    }, {
      "key": "tour_name",
      "value": "Best Viaje ever"
    }, {
      "key": "extras",
      "value": "245.00"
    }]
  },
  "revision": "859",
  "client_engine": "js",
  "client_version": "0.188.1",
  "element": {
    "campaign_id": "26314710187",
    "experiment_id": "26322360336",
    "variation_id": "26314800349",
    "is_holdback": "false"
  }
}]

字符串
它的某些部分很容易提取:

import json
# load data using Python JSON module
with open(url,'r') as f:
    data = json.loads(f.read())

 data = pd.json_normalize(data)


然而,在JSON的属性和标记字典中有一个列表,我无法解析。

nwnhqdif

nwnhqdif1#

我在a previous answer的笔记中写道:

import json

import pandas as pd

# I saved your data to a JSONL file.
with open("scratch_1.jsonl", mode="r") as f:
    data = json.loads(f.read())

# These fields are at the top level of your JSONL.
# We use them as metadata, i.e. index fields.
# Note that I excluded the fields in the "element" dict as they were 
# present in the "experiments.list" dict.
meta = [
    "uuid",
    "timestamp",
    "process_timestamp",
    "visitor_id",
    "session_id",
    "account_id",
    "entity_id",
    "user_ip",
    "user_agent",
    "referer",
    "event_type",
    "event_name",
    "revenue",
    "value",
    "quantity",
    "revision",
    "client_engine",
    "client_version",
]

# Normalize each nested list explicitly.
# Each of these objects is its own dataframe with the same metadata.
experiments_list = pd.json_normalize(
    data=data,
    record_path=["experiments", "list"],
    meta=meta,
    record_prefix="experiments.list.",
)
attributes_list = pd.json_normalize(
    data=data,
    record_path=["attributes", "list"],
    meta=meta,
    record_prefix="attributes.list.",
)
tags_key_value = pd.json_normalize(
    data=data,
    record_path=["tags", "key_value"],
    meta=meta,
    record_prefix="tags.key_value."
)
print(experiments_list)
experiments.list.element.campaign_id  ... client_version
0                          26314710187  ...        0.188.1
print(attributes_list)
attributes.list.element.id  ... client_version
0                       None  ...        0.188.1
1                       None  ...        0.188.1
2                       None  ...        0.188.1
3                       None  ...        0.188.1
4                       None  ...        0.188.1
5                       None  ...        0.188.1
6                       None  ...        0.188.1
7                       None  ...        0.188.1
print(tags_key_value)
tags.key_value.key tags.key_value.value  ... client_engine client_version
0            tour_id                  386  ...            js        0.188.1
1         booking_id               123456  ...            js        0.188.1
2       payment_type           creditcard  ...            js        0.188.1
3            revenue               240939  ...            js        0.188.1
4                pax                    1  ...            js        0.188.1
5          tour_name      Best Viaje ever  ...            js        0.188.1
6             extras               245.00  ...            js        0.188.1
import json

import pandas as pd

# I saved your data to a JSONL file.
with open("scratch_1.jsonl", mode="r") as f:
    data = json.loads(f.read())

# These fields are at the top level of your JSONL.
# We use them as metadata, i.e. index fields.
# Note that I excluded the fields in the "element" dict as they were 
# present in the "experiments.list" dict.
meta = [
    "uuid",
    "timestamp",
    "process_timestamp",
    "visitor_id",
    "session_id",
    "account_id",
    "entity_id",
    "user_ip",
    "user_agent",
    "referer",
    "event_type",
    "event_name",
    "revenue",
    "value",
    "quantity",
    "revision",
    "client_engine",
    "client_version",
]

# Normalize each nested list explicitly.
# Each of these objects is its own dataframe with the same metadata.
experiments_list = pd.json_normalize(
    data=data,
    record_path=["experiments", "list"],
    meta=meta,
    record_prefix="experiments.list.",
)
attributes_list = pd.json_normalize(
    data=data,
    record_path=["attributes", "list"],
    meta=meta,
    record_prefix="attributes.list.",
)
tags_key_value = pd.json_normalize(
    data=data,
    record_path=["tags", "key_value"],
    meta=meta,
    record_prefix="tags.key_value."
)
import json

import pandas as pd

# I saved your data to a JSONL file.
with open("scratch_1.jsonl", mode="r") as f:
    data = json.loads(f.read())

# These fields are at the top level of your JSONL.
# We use them as metadata, i.e. index fields.
# Note that I excluded the fields in the "element" dict as they were 
# present in the "experiments.list" dict.
meta = [
    "uuid",
    "timestamp",
    "process_timestamp",
    "visitor_id",
    "session_id",
    "account_id",
    "entity_id",
    "user_ip",
    "user_agent",
    "referer",
    "event_type",
    "event_name",
    "revenue",
    "value",
    "quantity",
    "revision",
    "client_engine",
    "client_version",
]

# Normalize each nested list explicitly.
# Each of these objects is its own dataframe with the same metadata.
experiments_list = pd.json_normalize(
    data=data,
    record_path=["experiments", "list"],
    meta=meta,
    record_prefix="experiments.list.",
)
attributes_list = pd.json_normalize(
    data=data,
    record_path=["attributes", "list"],
    meta=meta,
    record_prefix="attributes.list.",
)
tags_key_value = pd.json_normalize(
    data=data,
    record_path=["tags", "key_value"],
    meta=meta,
    record_prefix="tags.key_value."
)
print(experiments_list)
experiments.list.element.campaign_id  ... client_version
0                          26314710187  ...        0.188.1
print(attributes_list)
attributes.list.element.id  ... client_version
0                       None  ...        0.188.1
1                       None  ...        0.188.1
2                       None  ...        0.188.1
3                       None  ...        0.188.1
4                       None  ...        0.188.1
5                       None  ...        0.188.1
6                       None  ...        0.188.1
7                       None  ...        0.188.1
print(tags_key_value)
tags.key_value.key tags.key_value.value  ... client_engine client_version
0            tour_id                  386  ...            js        0.188.1
1         booking_id               123456  ...            js        0.188.1
2       payment_type           creditcard  ...            js        0.188.1
3            revenue               240939  ...            js        0.188.1
4                pax                    1  ...            js        0.188.1
5          tour_name      Best Viaje ever  ...            js        0.188.1
6             extras               245.00  ...            js        0.188.1
import json

import pandas as pd

# I saved your data to a JSONL file.
with open("scratch_1.jsonl", mode="r") as f:
    data = json.loads(f.read())

# These fields are at the top level of your JSONL.
# We use them as metadata, i.e. index fields.
# Note that I excluded the fields in the "element" dict as they were 
# present in the "experiments.list" dict.
meta = [
    "uuid",
    "timestamp",
    "process_timestamp",
    "visitor_id",
    "session_id",
    "account_id",
    "entity_id",
    "user_ip",
    "user_agent",
    "referer",
    "event_type",
    "event_name",
    "revenue",
    "value",
    "quantity",
    "revision",
    "client_engine",
    "client_version",
]

# Normalize each nested list explicitly.
# Each of these objects is its own dataframe with the same metadata.
experiments_list = pd.json_normalize(
    data=data,
    record_path=["experiments", "list"],
    meta=meta,
    record_prefix="experiments.list.",
)
attributes_list = pd.json_normalize(
    data=data,
    record_path=["attributes", "list"],
    meta=meta,
    record_prefix="attributes.list.",
)
tags_key_value = pd.json_normalize(
    data=data,
    record_path=["tags", "key_value"],
    meta=meta,
    record_prefix="tags.key_value."
)
import json

import pandas as pd

# I saved your data to a JSONL file.
with open("scratch_1.jsonl", mode="r") as f:
    data = json.loads(f.read())

# These fields are at the top level of your JSONL.
# We use them as metadata, i.e. index fields.
# Note that I excluded the fields in the "element" dict as they were 
# present in the "experiments.list" dict.
meta = [
    "uuid",
    "timestamp",
    "process_timestamp",
    "visitor_id",
    "session_id",
    "account_id",
    "entity_id",
    "user_ip",
    "user_agent",
    "referer",
    "event_type",
    "event_name",
    "revenue",
    "value",
    "quantity",
    "revision",
    "client_engine",
    "client_version",
]

# Normalize each nested list explicitly.
# Each of these objects is its own dataframe with the same metadata.
experiments_list = pd.json_normalize(
    data=data,
    record_path=["experiments", "list"],
    meta=meta,
    record_prefix="experiments.list.",
)
attributes_list = pd.json_normalize(
    data=data,
    record_path=["attributes", "list"],
    meta=meta,
    record_prefix="attributes.list.",
)
tags_key_value = pd.json_normalize(
    data=data,
    record_path=["tags", "key_value"],
    meta=meta,
    record_prefix="tags.key_value."
)
import json

import pandas as pd

# I saved your data to a JSONL file.
with open("scratch_1.jsonl", mode="r") as f:
    data = json.loads(f.read())

# These fields are at the top level of your JSONL.
# We use them as metadata, i.e. index fields.
# Note that I excluded the fields in the "element" dict as they were 
# present in the "experiments.list" dict.
meta = [
    "uuid",
    "timestamp",
    "process_timestamp",
    "visitor_id",
    "session_id",
    "account_id",
    "entity_id",
    "user_ip",
    "user_agent",
    "referer",
    "event_type",
    "event_name",
    "revenue",
    "value",
    "quantity",
    "revision",
    "client_engine",
    "client_version",
]

# Normalize each nested list explicitly.
# Each of these objects is its own dataframe with the same metadata.
experiments_list = pd.json_normalize(
    data=data,
    record_path=["experiments", "list"],
    meta=meta,
    record_prefix="experiments.list.",
)
attributes_list = pd.json_normalize(
    data=data,
    record_path=["attributes", "list"],
    meta=meta,
    record_prefix="attributes.list.",
)
tags_key_value = pd.json_normalize(
    data=data,
    record_path=["tags", "key_value"],
    meta=meta,
    record_prefix="tags.key_value."
)
import json

import pandas as pd

# I saved your data to a JSONL file.
with open("scratch_1.jsonl", mode="r") as f:
    data = json.loads(f.read())

# These fields are at the top level of your JSONL.
# We use them as metadata, i.e. index fields.
# Note that I excluded the fields in the "element" dict as they were 
# present in the "experiments.list" dict.
meta = [
    "uuid",
    "timestamp",
    "process_timestamp",
    "visitor_id",
    "session_id",
    "account_id",
    "entity_id",
    "user_ip",
    "user_agent",
    "referer",
    "event_type",
    "event_name",
    "revenue",
    "value",
    "quantity",
    "revision",
    "client_engine",
    "client_version",
]

# Normalize each nested list explicitly.
# Each of these objects is its own dataframe with the same metadata.
experiments_list = pd.json_normalize(
    data=data,
    record_path=["experiments", "list"],
    meta=meta,
    record_prefix="experiments.list.",
)
attributes_list = pd.json_normalize(
    data=data,
    record_path=["attributes", "list"],
    meta=meta,
    record_prefix="attributes.list.",
)
tags_key_value = pd.json_normalize(
    data=data,
    record_path=["tags", "key_value"],
    meta=meta,
    record_prefix="tags.key_value."
)
print(experiments_list)
print(experiments_list)
experiments.list.element.campaign_id  ... client_version
0                          26314710187  ...        0.188.1
print(attributes_list)
attributes.list.element.id  ... client_version
0                       None  ...        0.188.1
1                       None  ...        0.188.1
2                       None  ...        0.188.1
3                       None  ...        0.188.1
4                       None  ...        0.188.1
5                       None  ...        0.188.1
6                       None  ...        0.188.1
7                       None  ...        0.188.1
print(tags_key_value)
print(tags_key_value)
tags.key_value.key tags.key_value.value  ... client_engine client_version
0            tour_id                  386  ...            js        0.188.1
1         booking_id               123456  ...            js        0.188.1
2       payment_type           creditcard  ...            js        0.188.1
3            revenue               240939  ...            js        0.188.1
4                pax                    1  ...            js        0.188.1
5          tour_name      Best Viaje ever  ...            js        0.188.1
6             extras               245.00  ...            js        0.188.1
import json

import pandas as pd

# I saved your data to a JSONL file.
with open("scratch_1.jsonl", mode="r") as f:
    data = json.loads(f.read())

# These fields are at the top level of your JSONL.
# We use them as metadata, i.e. index fields.
# Note that I excluded the fields in the "element" dict as they were 
# present in the "experiments.list" dict.
meta = [
    "uuid",
    "timestamp",
    "process_timestamp",
    "visitor_id",
    "session_id",
    "account_id",
    "entity_id",
    "user_ip",
    "user_agent",
    "referer",
    "event_type",
    "event_name",
    "revenue",
    "value",
    "quantity",
    "revision",
    "client_engine",
    "client_version",
]

# Normalize each nested list explicitly.
# Each of these objects is its own dataframe with the same metadata.
experiments_list = pd.json_normalize(
    data=data,
    record_path=["experiments", "list"],
    meta=meta,
    record_prefix="experiments.list.",
)
attributes_list = pd.json_normalize(
    data=data,
    record_path=["attributes", "list"],
    meta=meta,
    record_prefix="attributes.list.",
)
tags_key_value = pd.json_normalize(
    data=data,
    record_path=["tags", "key_value"],
    meta=meta,
    record_prefix="tags.key_value."
)
import json

import pandas as pd

# I saved your data to a JSONL file.
with open("scratch_1.jsonl", mode="r") as f:
    data = json.loads(f.read())

# These fields are at the top level of your JSONL.
# We use them as metadata, i.e. index fields.
# Note that I excluded the fields in the "element" dict as they were 
# present in the "experiments.list" dict.
meta = [
    "uuid",
    "timestamp",
    "process_timestamp",
    "visitor_id",
    "session_id",
    "account_id",
    "entity_id",
    "user_ip",
    "user_agent",
    "referer",
    "event_type",
    "event_name",
    "revenue",
    "value",
    "quantity",
    "revision",
    "client_engine",
    "client_version",
]

# Normalize each nested list explicitly.
# Each of these objects is its own dataframe with the same metadata.
experiments_list = pd.json_normalize(
    data=data,
    record_path=["experiments", "list"],
    meta=meta,
    record_prefix="experiments.list.",
)
attributes_list = pd.json_normalize(
    data=data,
    record_path=["attributes", "list"],
    meta=meta,
    record_prefix="attributes.list.",
)
tags_key_value = pd.json_normalize(
    data=data,
    record_path=["tags", "key_value"],
    meta=meta,
    record_prefix="tags.key_value."
)

1b1b1b1x 1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b1b
您可以将merge中的所有数据放在meta列上。

out = (
    pd.merge(left=experiments_list, right=attributes_list, on=meta)
    .merge(right=tags_key_value, on=meta)
)
print(out)
experiments.list.element.campaign_id  ... tags.key_value.value
0                           26314710187  ...                  386
1                           26314710187  ...               123456
2                           26314710187  ...           creditcard
3                           26314710187  ...               240939
4                           26314710187  ...                    1
5                           26314710187  ...      Best Viaje ever
6                           26314710187  ...               245.00
7                           26314710187  ...                  386
8                           26314710187  ...               123456
9                           26314710187  ...           creditcard
10                          26314710187  ...               240939
11                          26314710187  ...                    1
12                          26314710187  ...      Best Viaje ever
13                          26314710187  ...               245.00
14                          26314710187  ...                  386
15                          26314710187  ...               123456
16                          26314710187  ...           creditcard
17                          26314710187  ...               240939
18                          26314710187  ...                    1
19                          26314710187  ...      Best Viaje ever
20                          26314710187  ...               245.00
21                          26314710187  ...                  386
22                          26314710187  ...               123456
23                          26314710187  ...           creditcard
24                          26314710187  ...               240939
25                          26314710187  ...                    1
26                          26314710187  ...      Best Viaje ever
27                          26314710187  ...               245.00
28                          26314710187  ...                  386
29                          26314710187  ...               123456
30                          26314710187  ...           creditcard
31                          26314710187  ...               240939
32                          26314710187  ...                    1
33                          26314710187  ...      Best Viaje ever
34                          26314710187  ...               245.00
35                          26314710187  ...                  386
36                          26314710187  ...               123456
37                          26314710187  ...           creditcard
38                          26314710187  ...               240939
39                          26314710187  ...                    1
40                          26314710187  ...      Best Viaje ever
41                          26314710187  ...               245.00
42                          26314710187  ...                  386
43                          26314710187  ...               123456
44                          26314710187  ...           creditcard
45                          26314710187  ...               240939
46                          26314710187  ...                    1
47                          26314710187  ...      Best Viaje ever
48                          26314710187  ...               245.00
49                          26314710187  ...                  386
50                          26314710187  ...               123456
51                          26314710187  ...           creditcard
52                          26314710187  ...               240939
53                          26314710187  ...                    1
54                          26314710187  ...      Best Viaje ever
55                          26314710187  ...               245.00

的最小值
由于每个嵌套列表都必须与其他嵌套列表连接,因此可能存在重复项。

experiments_list.shape  # (1, 22)
attributes_list.shape  # (8, 22)
tags_key_value.shape  # (7, 20)


这些行的乘积将是56,即out Dataframe 的长度。

相关问题