How to extract Json data scraped from website - python

I used Beautiful Soup to extract data from a website. The content is JSON and I need to extract all the display_name values. I have no clue how to navigate the data and print the values I need to save in my CSV.
I tried using some array examples like this one
# Asker's attempt, reproduced as posted (the original indentation was lost).
# NOTE(review): `jsons` is never assigned in this snippet and the loop variable
# `productoslvl` is never used, so the lookup below does not read the matched
# <script> tag at all.
for productoslvl in soup2.findAll('script',{'id' :'searchResult'}):
element = jsons[0]['display_name']
print (element)
but I keep getting KeyError
This is the JSON data:
{
'page_size': -1,
'refinements': [{
'display_name': 'Brand',
'values': [{
'display_name': 'Acqua Di Parma',
'status': 4,
'value': 900096
}],
'type': 'checkboxes'
}, {
'display_name': 'Bristle Type',
'values': [{
'display_name': 'Addictive',
'status': 1,
'value': 14578019
}, {
'display_name': 'Casual',
'status': 1,
'value': 14578020
}, {
'display_name': 'Chic',
'status': 1,
'value': 14301148
}, {
'display_name': 'Polished',
'status': 1,
'value': 14578022
}],
'type': 'checkboxes'
}, {
'display_name': 'Coverage',
'values': [{
'display_name': 'Balanced',
'status': 1,
'value': 14301025
}, {
'display_name': 'Light',
'status': 1,
'value': 14577894
}, {
'display_name': 'Rich',
'status': 1,
'value': 14577895
}],
'type': 'checkboxes'
}, {
'display_name': 'Formulation',
'values': [{
'display_name': 'Cream',
'status': 1,
'value': 100069
}, {
'display_name': 'Spray',
'status': 1,
'value': 100072
}],
'type': 'checkboxes'
}

Related

How to groupby columns by value and make json from them? Python3 Pandas

I have a dataset containing all the professors in Turkey. I need to change the shape of this data structure, but I couldn't find a solution. In this data, there is information about the university, faculty, department and title of approximately 44 thousand academicians.
[ { "name": "XX", "title": "PROFESÖR", "university": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculty": "MÜHENDİSLİK FAKÜLTESİ", "department": "BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ" }, { "name": "YY", "title": "PROFESÖR", "university": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculty": "MÜHENDİSLİK FAKÜLTESİ", "department": "ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ" } ]
I have 44,000 records like the ones above and I want to process them. For example, there are nearly 200 universities, and I want to group the records by university.
{ "universities": [ { "id": 1, "name": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculties": [ { "id" : 1, "name": "MÜHENDİSLİK FAKÜLTESİ", "departments" : [ { "id" : 1, "name" : "ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ", "academicians" : [ { "id" : 1, "name":"AA", "title" : "PROFESÖR" }, { "id" : 2, "name":"BB", "title" : "PROFESÖR" }, { "id" : 3, "name":"CC", "title" : "PROFESÖR" } ] }, { "id" : 2, "name" : "HARİTA MÜHENDİSLİĞİ BÖLÜMÜ", "academicians" : [ { "id" : 1, "name":"DD", "title" : "PROFESÖR" }, { "id" : 2, "name":"EE", "title" : "PROFESÖR" } ] } ] } ] } ] }
I want it as in the above format but I couldn't get it done. Can anyone help?
1.) get json datas
js_output = """{'universities': [{'id': 1,
'name': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculties': [{'id': 1,
'name': 'MÜHENDİSLİK FAKÜLTESİ',
'departments': [{'id': 1,
'name': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ',
'academicians': [{'id': 1, 'name': 'AA', 'title': 'PROFESÖR'},
{'id': 2, 'name': 'BB', 'title': 'PROFESÖR'},
{'id': 3, 'name': 'CC', 'title': 'PROFESÖR'}]},
{'id': 2,
'name': 'HARİTA MÜHENDİSLİĞİ BÖLÜMÜ',
'academicians': [{'id': 1, 'name': 'DD', 'title': 'PROFESÖR'},
{'id': 2, 'name': 'EE', 'title': 'PROFESÖR'}]}]}]}]}"""
js_input = """[{'name': 'XX',
'title': 'PROFESÖR',
'university': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'YY',
'title': 'PROFESÖR',
'university': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'}]"""
2.) set json normalize
# get record_path with json keys and get frame list
df_list = []


def get_frames(l, j):
    """Normalize ``j`` once per nesting level and collect the frames.

    For a record path ``l = [k1, k2, ..., kn]`` this appends
    ``pd.json_normalize(j, [k1])``, ``pd.json_normalize(j, [k1, k2])`` and so
    on up to the full path, in that order, to the module-level ``df_list``.

    Args:
        l: Sequence of keys leading to the innermost list of records,
            outermost key first.
        j: Parsed JSON object to normalize.

    Returns:
        The module-level ``df_list`` (the same list object that was appended
        to), so callers may use the return value instead of the global.
    """
    # Walk the prefixes of the record path: depth 1 is the outermost list,
    # depth len(l) the innermost one.
    for depth in range(1, len(l) + 1):
        df_list.append(pd.json_normalize(j, l[:depth]))
    return df_list
# Keys of the nested lists, outermost first; passed to get_frames as the record path.
records = ["universities", "faculties", "departments", "academicians"]
# js_output is written with single quotes, which JSON does not accept, so the
# quotes are swapped before parsing.
# NOTE(review): the blanket replace would also rewrite apostrophes inside values.
jdo = json.loads(js_output.replace("'",'"'))
get_frames(records, jdo)
3.) concatenate all frames
# Every frame except the last has its final column dropped (per the original
# note, that column still holds the nested records that the next frame opens
# up); the innermost frame is kept whole.
parts = [frame.iloc[:, :-1] for frame in df_list[:-1]]
parts.append(df_list[-1])
# A single column-wise concat replaces growing `con` one frame at a time.
con = pd.concat(parts, axis=1)
4.) drop na because of example frame is output template
# Keep only the rows in which every column is populated; .copy() detaches the
# result from `con` before its columns are renamed.
df = con.dropna().copy()
5.) design columns and match input keys for next concatenates
# Positional rename: one name per column of `df`, in order. The non-id names
# (university, faculty, department, name, title) match the keys of the input
# records so that the frames line up when concatenated.
df.columns = [
"uni_id",
"university",
"faculty_id",
"faculty",
"department_id",
"department",
"aca_id",
"name",
"title"
]
6.) refix id sections and join input frame with template
def input_join_to_get_desired_template(jdi, template=None):
    """Append input records to the template frame and renumber all id columns.

    Args:
        jdi: List of dicts with the keys ``name``, ``title``, ``university``,
            ``faculty`` and ``department``.
        template: Frame the records are appended to. Defaults to the
            module-level ``df``, which is what the original version always
            used.

    Returns:
        A new DataFrame holding the template rows followed by the input rows.
        ``uni_id``, ``faculty_id`` and ``department_id`` number the distinct
        names 1, 2, 3, ... in order of first appearance; ``aca_id`` numbers
        the rows 1, 2, 3, ... within each (uni_id, faculty_id, department_id)
        combination.
    """
    if template is None:
        template = df
    con_df = pd.concat([template, pd.DataFrame(jdi)], ignore_index=True, sort=False)

    # factorize assigns codes 0, 1, 2, ... in order of first appearance, which
    # is the same numbering as looking each value up in the list of uniques,
    # without a linear scan per row. Missing names get code -1, i.e. id 0.
    id_sources = (
        ("uni_id", "university"),
        ("faculty_id", "faculty"),
        ("department_id", "department"),
    )
    for id_col, name_col in id_sources:
        con_df[id_col] = pd.factorize(con_df[name_col])[0] + 1

    # Running 1-based position of each academician inside its department group.
    group_keys = ["uni_id", "faculty_id", "department_id"]
    con_df["aca_id"] = con_df.groupby(group_keys).cumcount() + 1
    return con_df
# Parse the single-quoted sample input (quotes swapped to make it valid JSON)
# and merge it into the template frame.
jd_input = json.loads(js_input.replace("'",'"'))
result_df = input_join_to_get_desired_template(jd_input)
# Bare expression: shows the frame when run in a notebook/REPL cell.
result_df
7.) get other inputs and test
js_input_test = """[{'name': 'hl',
'title': 'doc',
'university': 'GEBZE ÜNİVERSİTESİ',
'faculty': 'FAKÜLTESİ',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'hl',
'title': 'PROFESÖR',
'university': 'GEBZE ÜNİVERSİTESİ',
'faculty': 'FAKÜLTESİ',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'yz',
'title': 'yrddoc',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'zz',
'title': 'doc',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'hl',
'title': 'PROFESÖR',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'abc',
'title': 'prof',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'HARİTA MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'aaa',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'bbb',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'ccc',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'ddd',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'}]"""
8.) and get results
# Same steps as for js_input, applied to the larger test input.
jd_input = json.loads(js_input_test.replace("'",'"'))
result_df = input_join_to_get_desired_template(jd_input)
# Bare expression: shows the frame when run in a notebook/REPL cell.
result_df

Python: Change a JSON value

Let's say I have the following JSON file named output.
{'fields': [{'name': 2, 'type': 'Int32'},
{'name': 12, 'type': 'string'},
{'name': 9, 'type': 'datetimeoffset'}],
'type': 'struct'}
If the type key has the value datetimeoffset, I would like to change it to dateTime, and if the type key has the value Int32, I would like to change it to integer. I have multiple values to replace in this way.
The expected output is
{'fields': [{'name': 2, 'type': 'integer'},
{'name': 12, 'type': 'string'},
{'name': 9, 'type': 'dateTime'}],
'type': 'struct'}
Can anyone help with this in Python?
You can try this out:
# Maps each type name to its replacement; names not listed stay unchanged.
substitute = {"Int32": "integer", "datetimeoffset": "dateTime"}

x = {'fields': [
    {'name': 2, 'type': 'Int32'},
    {'name': 12, 'type': 'string'},
    {'name': 9, 'type': 'datetimeoffset'}
], 'type': 'struct'}

# Iterate over the field dicts themselves (no index needed) and rewrite the
# type in place, falling back to the current value when there is no mapping.
for field in x['fields']:
    field['type'] = substitute.get(field['type'], field['type'])

print(x)
You can use the following code. Include in equivalences dict the values you want to replace:
# Named `schema` rather than `json` so that the standard-library `json`
# module is not shadowed by this variable.
schema = {
    'fields': [
        {'name': 2, 'type': 'Int32'},
        {'name': 12, 'type': 'string'},
        {'name': 9, 'type': 'datetimeoffset'},
    ],
    'type': 'struct'
}
equivalences = {"datetimeoffset": "dateTime", "Int32": "integer"}

# Replace values based on equivalences dict. Each `field` is the dict stored
# in the list, so assigning to it updates `schema` in place.
for field in schema["fields"]:
    if field["type"] in equivalences:
        field["type"] = equivalences[field["type"]]

print(schema)
The output is:
{
"fields": [
{
"name": 2,
"type": "integer"
},
{
"name": 12,
"type": "string"
},
{
"name": 9,
"type": "dateTime"
}
],
"type": "struct"
}
simple but ugly way:
# Source document; the type names are rewritten on its serialized text.
json_ = {
    'fields': [
        {'name': 2, 'type': 'Int32'},
        {'name': 12, 'type': 'string'},
        {'name': 9, 'type': 'datetimeoffset'},
    ],
    'type': 'struct',
}

# Serialize, apply the textual replacements in order, then parse back.
# The replacement is purely textual, so it touches every occurrence of these
# substrings in the serialized document, not only "type" values.
serialized = json.dumps(json_)
for old_name, new_name in (("datetimeoffset", "dateTime"), ("Int32", "integer")):
    serialized = serialized.replace(old_name, new_name)
result = json.loads(serialized)

Remove nested element occurs twice but should be only once

I have a problem. I want to remove all nested elements inside a dict, but unfortunately my code does not work: every nested element occurs twice in the result, although it should occur only once.
What is the problem for that?
Method
# Version from the question, kept as posted (its indentation was lost).
# For every dict-valued entry of a source dictionary it stores a fresh UUID
# string under "__id" inside the nested dict, copies the nested dict into
# my_new_dict, and overwrites the entry in the source dictionary with that UUID.
# NOTE(review): the posted output contains each result dict once per nested
# key, which is what happens if the append runs once per dict-valued key
# rather than once per source dictionary — confirm against the original
# indentation.
def nested_dict(dictionaries):
my_list = []
for my_Dict in dictionaries:
my_new_dict = {}
for key in my_Dict.keys():
if isinstance(my_Dict[key], dict):
idx = str(uuid.uuid4())
my_Dict[key]["__id"] = idx
my_new_dict[key] = my_Dict[key]
my_Dict[key] = idx
my_list.append(my_new_dict)
return my_list
Running example
import uuid
my_Dict = {
'_key': '1',
'group': 'test',
'data': {},
'type': '',
'code': '007',
'conType': '1',
'flag': None,
'createdAt': '2021',
'currency': 'EUR',
'detail': {
'selector': {
'number': '12312',
'isTrue': True,
'requirements': [{
'type': 'customer',
'requirement': '1'}]
}
}
}
my_Dict2 = {
'_key': '2',
'group': 'test',
'data2': {},
'type': '',
'code': '007',
'conType': '1',
'flag': None,
'createdAt': '2021',
'currency': 'EUR',
'detail2': {
'selector': {
'number': '12312',
'isTrue': True,
'requirements': [{
'type': 'customer',
'requirement': '1'}]
}
}
}
dictionaries = [my_Dict, my_Dict2]
# Same function as in the "Method" listing, repeated inside the runnable
# example and kept as posted (its indentation was lost).
# Each dict-valued entry gets a fresh UUID string under "__id", is copied into
# my_new_dict, and is replaced in the source dictionary by that UUID.
# NOTE(review): the [OUT] listing shows every result dict twice, consistent
# with the append running once per dict-valued key — confirm against the
# original indentation.
def nested_dict(dictionaries):
my_list = []
for my_Dict in dictionaries:
my_new_dict = {}
for key in my_Dict.keys():
if isinstance(my_Dict[key], dict):
idx = str(uuid.uuid4())
my_Dict[key]["__id"] = idx
my_new_dict[key] = my_Dict[key]
my_Dict[key] = idx
my_list.append(my_new_dict)
return my_list
result = nested_dict(dictionaries)
result
[OUT]
[{'data': {'__id': '46f4eb3d-977c-4da4-a99c-c9bfa831b96e'},
'detail': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': 'fad4053e-75e5-4a03-93b6-67e0df814d23'}},
{'data': {'__id': '46f4eb3d-977c-4da4-a99c-c9bfa831b96e'},
'detail': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': 'fad4053e-75e5-4a03-93b6-67e0df814d23'}},
{'data2': {'__id': '6afcf48e-508c-476b-98f3-9bf1e8370fb4'},
'detail2': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': '2d4745ea-decd-45dc-aa0b-7bea5c449c34'}},
{'data2': {'__id': '6afcf48e-508c-476b-98f3-9bf1e8370fb4'},
'detail2': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': '2d4745ea-decd-45dc-aa0b-7bea5c449c34'}}]
What I want
[{'data': {'__id': '46f4eb3d-977c-4da4-a99c-c9bfa831b96e'},
'detail': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': 'fad4053e-75e5-4a03-93b6-67e0df814d23'}},
{'data2': {'__id': '6afcf48e-508c-476b-98f3-9bf1e8370fb4'},
'detail2': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': '2d4745ea-decd-45dc-aa0b-7bea5c449c34'}}]
import uuid
import json
my_Dict = {
'_key': '1',
'group': 'test',
'data': {},
'type': '',
'code': '007',
'conType': '1',
'flag': None,
'createdAt': '2021',
'currency': 'EUR',
'detail': {
'selector': {
'number': '12312',
'isTrue': True,
'requirements': [{
'type': 'customer',
'requirement': '1'}]
}
}
}
my_Dict2 = {
'_key': '2',
'group': 'test',
'data2': {},
'type': '',
'code': '007',
'conType': '1',
'flag': None,
'createdAt': '2021',
'currency': 'EUR',
'detail2': {
'selector': {
'number': '12312',
'isTrue': True,
'requirements': [{
'type': 'customer',
'requirement': '1'}]
}
}
}
dictionaries = [my_Dict, my_Dict2]
def nested_dict(dictionaries):
    """Move the nested dicts out of each dictionary, leaving a UUID in place.

    For every top-level entry whose value is a dict, a fresh UUID string is
    stored under ``"__id"`` inside that nested dict, the nested dict is
    collected, and the entry in the source dictionary is overwritten with the
    UUID. Only the top level is inspected; deeper nesting is left untouched.

    Args:
        dictionaries: Iterable of dicts. They are modified in place.

    Returns:
        A list with exactly one dict per source dictionary, mapping each
        extracted key to its nested dict. A source dictionary without nested
        dicts contributes an empty dict.
    """
    my_list = []
    for my_Dict in dictionaries:
        my_new_dict = {}
        # Overwriting the value of an existing key does not resize the dict,
        # so it is safe to do while iterating over items().
        for key, value in my_Dict.items():
            if isinstance(value, dict):
                idx = str(uuid.uuid4())
                value["__id"] = idx
                my_new_dict[key] = value
                my_Dict[key] = idx
        # Append once per source dictionary (outer loop level); appending
        # inside the inner loop is what duplicated the entries.
        my_list.append(my_new_dict)
    return my_list
output:
[
{
"data": {
"__id": "5c6769cf-01e5-4f5d-acfa-622472163aba"
},
"detail": {
"selector": {
"number": "12312",
"isTrue": true,
"requirements": [
{
"type": "customer",
"requirement": "1"
}
]
},
"__id": "d167277f-4d02-4d53-934b-131187f6f214"
}
},
{
"data2": {
"__id": "e9182913-c2fc-4d60-adb8-b0b8274faf50"
},
"detail2": {
"selector": {
"number": "12312",
"isTrue": true,
"requirements": [
{
"type": "customer",
"requirement": "1"
}
]
},
"__id": "46e6be7b-8903-4d2a-a768-f6b24fcc5d31"
}
}
]
Only a minor change is needed: you are appending to the list inside the inner for loop, but you should do it at the outer for loop level. I have pasted the code together with the output I got.
I think it is because my_new_dict is holding an object that is changed by the time it appends to the list.
def nested_dict(dictionaries):
    """Extract every nested dict as its own single-key dict.

    For every top-level entry whose value is a dict, a fresh UUID string is
    stored under ``"__id"`` inside the nested dict, ``{key: nested_dict}`` is
    appended to the result, and the entry in the source dictionary is
    overwritten with the UUID.

    Args:
        dictionaries: Iterable of dicts. They are modified in place.

    Returns:
        A flat list with one single-key dict per nested dict found, in
        encounter order. Entries from different source dictionaries are not
        grouped.
    """
    my_list = []
    for my_Dict in dictionaries:
        for key, value in my_Dict.items():
            if isinstance(value, dict):
                idx = str(uuid.uuid4())
                value["__id"] = idx
                # A new wrapper dict per entry, so no list element is changed
                # by later iterations.
                my_list.append({key: value})
                my_Dict[key] = idx
    print(my_list)
    return my_list

How do I get data from a nested dict?

Hello, I'm trying to get specific data out of an API call from a website. This is the data I'm receiving:
{'type': 'NonStockItem', 'attributes': [], 'id': '1', 'description': 'Ikke lagerførte varer høy sats'}
{'type': 'NonStockItem', 'attributes': [], 'id': '2', 'description': 'Ikke lagerførte varer middels sats'}
{'type': 'NonStockItem', 'attributes': [], 'id': '3', 'description': 'Ikke lagerførte varer lav sats'}
{'type': 'NonStockItem', 'attributes': [], 'id': '4', 'description': 'Ikke lagerførte varer avgiftsfri'}
{'type': 'FinishedGoodItem', 'attributes': [{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': False, 'attributeType': 'Text', 'details': []}], 'id': '5', 'description': 'Lagerførte varer høy sats'}
{'type': 'FinishedGoodItem', 'attributes': [], 'id': '6', 'description': 'Lagerførte varer middels sats'}
{'type': 'FinishedGoodItem', 'attributes': [], 'id': '7', 'description': 'Lagerførte varer avgiftsfri'}
{'type': 'LaborItem', 'attributes': [], 'id': '8', 'description': 'Tjenester (prosjekt)'}
{'type': 'ExpenseItem', 'attributes': [], 'id': '9', 'description': 'Utgifter (Reise)'}
{'type': 'FinishedGoodItem', 'attributes': [{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': True, 'attributeType': 'Text', 'details': []}], 'id': 'ONLINE', 'description': 'Online'}
{'type': 'FinishedGoodItem', 'attributes': [{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': False, 'attributeType': 'Text', 'details': []}, {'attributeId': 'WEB2', 'description': 'tilgjengelighet i nettbutikk', 'required': True, 'attributeType': 'Combo', 'details': [{'id': 'Ikke Inne', 'description': 'Produktet er utsolgt.'}, {'id': 'Inne', 'description': 'tilgjengelig i nettbutikk'}]}], 'id': 'WEB', 'description': 'Tilgjengelig på nettbutikk.'}
This is the object fields
[
{
"type": "NonStockItem",
"attributes": [
{
"attributeId": "string",
"description": "string",
"sortOrder": 0,
"required": true,
"attributeType": "Text"
}
]
this is my code
# Asker's code as posted (indentation lost in extraction).
# Parses the response body as JSON when the status code is 200 and prints
# three fields of every item.
if response.status_code == 200:
itemClass = json.loads(response.text)
for item in itemClass:
print(item["type"])
print(item["description"])
# Printed unconditionally, so items without attributes show up as "[]".
print(item["attributes"])
What I'm trying to do is to get only the attributes with an existing attributeId. I'm a bit stuck because the data inside the attributes array is a dict, how can I get the key values?
Current output:
NonStockItem
Ikke lagerførte varer høy sats
[]
NonStockItem
Ikke lagerførte varer middels sats
[]
NonStockItem
Ikke lagerførte varer lav sats
[]
NonStockItem
Ikke lagerførte varer avgiftsfri
[]
FinishedGoodItem
Lagerførte varer høy sats
[{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': False, 'attributeType': 'Text', 'details': []}]
FinishedGoodItem
Lagerførte varer middels sats
[]
FinishedGoodItem
Lagerførte varer avgiftsfri
[]
LaborItem
Tjenester (prosjekt)
[]
ExpenseItem
Utgifter (Reise)
[]
FinishedGoodItem
Online
[{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': True, 'attributeType': 'Text', 'details': []}]
FinishedGoodItem
Tilgjengelig på nettbutikk.
[{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': False, 'attributeType': 'Text', 'details': []}, {'attributeId': 'WEB2', 'description': 'tilgjengelighet i nettbutikk', 'required': True, 'attributeType': 'Combo', 'details': [{'id': 'Ikke Inne', 'description': 'Produktet er utsolgt.'}, {'id': 'Inne', 'description': 'tilgjengelig i nettbutikk'}]}]
I only want the types which contain an attributeId
I am assuming the list you are working on is accessible using lst[0]['attributes'].
Try the following, which uses list comprehension:
# Two sample attribute records: the first carries an attributeId, the second
# deliberately does not.
with_id = {
    "attributeId": "string",
    "description": "string",
    "sortOrder": 0,
    "required": True,
    "attributeType": "Text",
}
without_id = {
    # Note that it does not have attributeId
    "description": "string",
    "sortOrder": 0,
    "required": True,
    "attributeType": "Text",
}
lst = [{"type": "NonStockItem", "attributes": [with_id, without_id]}]

attrs = lst[0]['attributes']
# List comprehension: keep only the attribute dicts that define 'attributeId'.
output = [attr for attr in attrs if 'attributeId' in attr]
print(output)
Output:
[{'attributeId': 'string', 'description': 'string', 'sortOrder': 0, 'required': True, 'attributeType': 'Text'}]
Note that the output has only one element; in the input example I gave, the second dict does not have attributeId.
Pandas json_normalize could be used for this as well:
import json
import pandas as pd

# Sample API response: three items, the middle one without attributes.
response = '''[
  {
    "type": "NonStockItem",
    "attributes": [
      {
        "attributeId": "string1",
        "description": "string",
        "sortOrder": 0,
        "required": true,
        "attributeType": "Text"
      },
      {
        "attributeId": "string2",
        "description": "string",
        "sortOrder": 0,
        "required": true,
        "attributeType": "Text"
      }]
  },
  {
    "type": "NonStockItem",
    "attributes":[]
  },
  {
    "type": "NonStockItem",
    "attributes": [
      {
        "attributeId": "string3",
        "description": "string",
        "sortOrder": 0,
        "required": true,
        "attributeType": "Text"
      },
      {
        "attributeId": "string4",
        "description": "string",
        "sortOrder": 0,
        "required": true,
        "attributeType": "Text"
      }]
  }
]
'''
itemClass = json.loads(response)

# record_path flattens the "attributes" list of every item in one call, so no
# per-item frame (including empty ones for items without attributes) has to
# be built and concatenated. Items with an empty list contribute no rows.
attributes_df = pd.json_normalize(itemClass, record_path="attributes")
print(attributes_df)
attributeId description sortOrder required attributeType
0 string1 string 0 True Text
1 string2 string 0 True Text
2 string3 string 0 True Text
3 string4 string 0 True Text
The best solution I could think of, considering your data sample and output, is to check whether item["attributes"] has any values inside:
Code:
itemclass = [{'type': 'NonStockItem', 'attributes': [], 'id': '1', 'description': 'Ikke lagerførte varer høy sats'},
{'type': 'NonStockItem', 'attributes': [], 'id': '2', 'description': 'Ikke lagerførte varer middels sats'},
{'type': 'NonStockItem', 'attributes': [], 'id': '3', 'description': 'Ikke lagerførte varer lav sats'},
{'type': 'NonStockItem', 'attributes': [], 'id': '4', 'description': 'Ikke lagerførte varer avgiftsfri'},
{'type': 'FinishedGoodItem', 'attributes': [{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': False, 'attributeType': 'Text', 'details': []}], 'id': '5', 'description': 'Lagerførte varer høy sats'},
{'type': 'FinishedGoodItem', 'attributes': [], 'id': '6', 'description': 'Lagerførte varer middels sats'},
{'type': 'FinishedGoodItem', 'attributes': [], 'id': '7', 'description': 'Lagerførte varer avgiftsfri'},
{'type': 'LaborItem', 'attributes': [], 'id': '8', 'description': 'Tjenester (prosjekt)'},
{'type': 'ExpenseItem', 'attributes': [], 'id': '9', 'description': 'Utgifter (Reise)'},
{'type': 'FinishedGoodItem', 'attributes': [{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': True, 'attributeType': 'Text', 'details': []}], 'id': 'ONLINE', 'description': 'Online'},
{'type': 'FinishedGoodItem', 'attributes': [{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': False, 'attributeType': 'Text', 'details': []}, {'attributeId': 'WEB2', 'description': 'tilgjengelighet i nettbutikk', 'required': True, 'attributeType': 'Combo', 'details': [{'id': 'Ikke Inne', 'description': 'Produktet er utsolgt.'}, {'id': 'Inne', 'description': 'tilgjengelig i nettbutikk'}]}], 'id': 'WEB', 'description': 'Tilgjengelig på nettbutikk.'}]
# Print only the items whose "attributes" list is non-empty; an empty list is
# falsy, so those items are skipped entirely.
for item in itemclass:
    if item["attributes"]:
        print(item["type"])
        print(item["description"])
        print(item["attributes"])
Output:
FinishedGoodItem
Lagerførte varer høy sats
[{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': False, 'attributeType': 'Text', 'details': []}]
FinishedGoodItem
Online
[{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': True, 'attributeType': 'Text', 'details': []}]
FinishedGoodItem
Tilgjengelig på nettbutikk.
[{'attributeId': 'NETTBUTIKK', 'description': 'WEB', 'required': False, 'attributeType': 'Text', 'details': []}, {'attributeId': 'WEB2', 'description': 'tilgjengelighet i nettbutikk', 'required': True, 'attributeType': 'Combo', 'details': [{'id': 'Ikke Inne', 'description': 'Produktet er utsolgt.'}, {'id': 'Inne', 'description': 'tilgjengelig i nettbutikk'}]}]

PyMongo - Aggregation pipeline to get user - mentioned user network

I have uploaded some tweets in a Mongo DB collection and I would like to extract the following information with PyMongo:
user.screen_name
entities.user_mentions.screen_name
count
i.e. I would like to know who has mentioned whom and how many times, in order to create some kind of network.
I used the following pipeline to get the most mentioned users but I'm not able to introduce also the user.screen_name:
# Asker's pipeline: counts how often each screen name is mentioned. The
# author (user.screen_name) is not projected, so it is unavailable to $group.
tweets.aggregate([
# Keep only the array of mentioned screen names and drop _id.
{'$project': {'mentions': '$entities.user_mentions.screen_name', '_id': 0}},
# Emit one document per mentioned screen name.
{'$unwind': '$mentions'},
# Count the documents for each mentioned screen name.
{'$group': {'_id': '$mentions', 'count': {'$sum': 1}}}
])
Here an example of document (tweet), where I removed some of the fields I'm not interested in:
{'_id': ObjectId('604c805b289d1ef5947e1845'),
'created_at': 'Fri Mar 12 04:36:10 +0000 2021',
'display_text_range': [0, 140],
'entities': {'hashtags': [{'indices': [124, 136], 'text': 'mytag'}],
'symbols': [],
'urls': [],
'user_mentions': [{'id': 123,
'id_str': '123',
'indices': [3, 14],
'name': 'user_name',
'screen_name': 'user_screen_name'}]},
'user': {'id': 456,
'id_str': '456',
'name': 'Author Name',
'screen_name': 'Author Screen Name'}}
{'_id': ObjectId('604c805b289d1ef5947e184x'),
'created_at': 'Fri Mar 12 04:36:10 +0000 2021',
'display_text_range': [0, 140],
'entities': {'hashtags': [{'indices': [124, 136], 'text': 'mytag'}],
'symbols': [],
'urls': [],
'user_mentions': [{'id': 126,
'id_str': '126',
'indices': [3, 14],
'name': 'user_name',
'screen_name': 'user_screen_name'}]},
'user': {'id': 4567,
'id_str': '4567',
'name': 'Other Author Name',
'screen_name': 'Other Author Screen Name'}}
In this example I would expect something like:
{'mentioned': 'user_screen_name',
'author': 'Author Screen Name',
'count': '1'},
{'mentioned': 'user_screen_name',
'author': 'Other Author Screen Name',
'count': '1'},
Can someone help me?
Thank you in advance for your help!
Francesca
// Counts mentions per (author, mentioned user) pair.
db.collection.aggregate([
// Keep the array of mentioned screen names and the author's screen name.
{
"$project": {
"mentions": "$entities.user_mentions.screen_name",
"author": "$user.screen_name"
}
},
// Emit one document per mentioned screen name.
{ "$unwind": "$mentions" },
// Group on the author/mention pair and count the documents in each group;
// $first copies the pair's values into top-level fields.
{
"$group": {
"_id": { aut: "$author", ment: "$mentions" },
"count": { "$sum": 1 },
author: { "$first": "$author" },
mentions: { "$first": "$mentions" }
}
},
// Drop the compound _id so only count, author and mentions remain.
{
"$project": { _id: 0 }
}
])
Working Mongo playground

Categories