I'm using the code below to pull in multiple json files into 1 using pagination. When I try to create a spark dataframe I get the error '
java.lang.ArrayStoreException: java.util.HashMap' during spark.read.json. Below the code block I've provided output from printing the 'issues' data set (minus proprietary info). I've done a bit of research and can't figure out what I can try to make this work. Any assistance would be greatly appreciated!!
import requests
import json
limit = 2
startat = 0
issues = []
for page_num in range(2):
startat = page_num*50
url = f"https://URL/rest/api/2/search?jql=TEST&startAt={startat}&maxResults={limit}"
req = requests.get(url, headers={'Accept': 'application/json', 'Authorization': 'Basic xxxxxxxxxxxxxxxxxxxxxxxxxx'})
data = req.json()
issues.extend(data['issues'])
jsonDF = spark.read.json(issues)
jsonDF.printSchema()
[{'expand': 'operations,versionedRepresentations,editmeta,changelog,customfield_10010.requestTypePractice,renderedFields', 'id': '11441', 'self': 'https://my.url.net/rest/api/2/issue/11441', 'key': 'TS-1401', 'fields': {'statuscategorychangedate': '2022-11-29T07:05:17.359-0800', 'issuetype': {'self': 'https://my.url.net/rest/api/2/issuetype/10004', 'id': '10004', 'description': 'Functionality or a feature expressed as a user goal.', 'iconUrl': 'https://my.url.net/rest/api/2/universal_avatar/view/type/issuetype/avatar/10315?size=medium', 'name': 'Story', 'subtask': False, 'avatarId': 10315, 'hierarchyLevel': 0}, 'parent': {'id': '11420', 'key': 'TS-1380', 'self': 'https://my.url.net/rest/api/2/issue/11420', 'fields': {'summary': 'Clone30 - Migration Epics', 'status': {'self': 'https://my.url.net/rest/api/2/status/10003', 'description': '', 'iconUrl': 'https://my.url.net/', 'name': 'Backlog', 'id': '10003', 'statusCategory': {'self': 'https://my.url.net/rest/api/2/statuscategory/2', 'id': 2, 'key': 'new', 'colorName': 'blue-gray', 'name': 'To Do'}}, 'priority': {'self': 'https://my.url.net/rest/api/2/priority/3', 'iconUrl': 'https://my.url.net/images/icons/priorities/medium.svg', 'name': 'Medium', 'id': '3'}, 'issuetype': {'self': 'https://my.url.net/rest/api/2/issuetype/10000', 'id': '10000', 'description': 'A big user story that needs to be broken down. Created by Jira Software - do not edit or delete.', 'iconUrl': 'https://my.url.net/images/icons/issuetypes/epic.svg', 'name': 'Epic', 'subtask': False, 'hierarchyLevel': 1}}}, 'timespent': None, 'project': {'self': 'https://my.url.net/rest/api/2/project/10001', 'id': '10001', 'key': 'TS', 'name': 'Project', 'projectTypeKey': 'software', 'simplified': False, 'avatarUrls': {'48x48': 'https://my.url.net/rest/api/2/universal_avatar/view/type/project/avatar/10556', '24x24': 'https://my.url.net/rest/api/2/universal_avatar/view/type/project/avatar/10556?size=small', '16x16': 'https://my.url.net/rest/api/2/universal_avatar/view/type/project/avatar/10556?size=xsmall', '32x32': 'https://my.url.net/rest/api/2/universal_avatar/view/type/project/avatar/10556?size=medium'}}, 'customfield_10033': None, 'fixVersions': [], 'aggregatetimespent': None, 'customfield_10034': [], 'customfield_10035': None, 'resolution': None, 'customfield_10036': None, 'customfield_10037': None, 'customfield_10027': None, 'customfield_10028': None, 'customfield_10029': None, 'resolutiondate': None, 'workratio': -1, 'watches': {'self': 'https://my.url.net/rest/api/2/issue/TS-1401/watchers', 'watchCount': 1, 'isWatching': True}, 'lastViewed': '2022-12-08T10:06:57.022-0800', 'created': '2022-11-29T07:05:16.501-0800', 'customfield_10020': None, 'customfield_10021': None, 'customfield_10022': None, 'priority': {'self': 'https://my.url.net/rest/api/2/priority/3', 'iconUrl': 'https://my.url.net/images/icons/priorities/medium.svg', 'name': 'Medium', 'id': '3'}, 'customfield_10023': None, 'customfield_10024': None, 'customfield_10025': None, 'customfield_10026': None, 'labels': [], 'customfield_10016': None, 'customfield_10017': None, 'customfield_10018': {'hasEpicLinkFieldDependency': False, 'showField': False, 'nonEditableReason': {'reason': 'EPIC_LINK_SHOULD_BE_USED', 'message': 'To set an epic as the parent, use the epic link instead'}}, 'customfield_10019': '0|i008a3:', 'timeestimate': None, 'aggregatetimeoriginalestimate': None, 'versions': [], 'issuelinks': [], 'assignee': None, 'updated': '2022-11-29T07:05:20.759-0800', 'status': {'self': 'https://my.url.net/rest/api/2/status/10003', 'description': '', 'iconUrl': 'https://my.url.net/', 'name': 'Backlog', 'id': '10003', 'statusCategory': {'self': 'https://my.url.net/rest/api/2/statuscategory/2', 'id': 2, 'key': 'new', 'colorName': 'blue-gray', 'name': 'To Do'}}, 'components': [], 'timeoriginalestimate': None, 'description': 'Data owner completes template (understand scope of migration efforts)', 'customfield_10010': None, 'customfield_10014': 'TS-1380', 'customfield_10015': None, 'customfield_10005': None, 'customfield_10006': None, 'customfield_10007': None, 'security': None, 'customfield_10008': None, 'customfield_10009': None, 'aggregatetimeestimate': None, 'summary': 'Template', 'creator': {'self': 'https://my.url.net/rest/api/2/user?accountId=5d669f4bf81f2c0d99ee9e38', 'accountId': '5d669f4bf81f2c0d99ee9e38', 'emailAddress': 'test#aol.com', 'avatarUrls': {'48x48': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '24x24': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '16x16': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '32x32': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png'}, 'displayName': 'Joe Test', 'active': True, 'timeZone': 'America/Los_Angeles', 'accountType': 'atlassian'}, 'subtasks': [{'id': '11442', 'key': 'TS-1402', 'self': 'https://my.url.net/rest/api/2/issue/11442', 'fields': {'summary': 'Complete Template with table/views required (in) and produced (out)', 'status': {'self': 'https://my.url.net/rest/api/2/status/10003', 'description': '', 'iconUrl': 'https://my.url.net/', 'name': 'Backlog', 'id': '10003', 'statusCategory': {'self': 'https://my.url.net/rest/api/2/statuscategory/2', 'id': 2, 'key': 'new', 'colorName': 'blue-gray', 'name': 'To Do'}}, 'priority': {'self': 'https://my.url.net/rest/api/2/priority/3', 'iconUrl': 'https://my.url.net/images/icons/priorities/medium.svg', 'name': 'Medium', 'id': '3'}, 'issuetype': {'self': 'https://my.url.net/rest/api/2/issuetype/10006', 'id': '10006', 'description': "A small piece of work that's part of a larger task.", 'iconUrl': 'https://my.url.net/rest/api/2/universal_avatar/view/type/issuetype/avatar/10316?size=medium', 'name': 'Sub-task', 'subtask': True, 'avatarId': 10316, 'hierarchyLevel': -1}}}], 'reporter': {'self': 'https://my.url.net/rest/api/2/user?accountId=5d669f4bf81f2c0d99ee9e38', 'accountId': '5d669f4bf81f2c0d99ee9e38', 'emailAddress': 'test#aol.com', 'avatarUrls': {'48x48': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '24x24': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '16x16': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '32x32': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png'}, 'displayName': 'Joe Test', 'active': True, 'timeZone': 'America/Los_Angeles', 'accountType': 'atlassian'}, 'aggregateprogress': {'progress': 0, 'total': 0}, 'customfield_10001': None, 'customfield_10002': None, 'customfield_10003': None, 'customfield_10004': None, 'customfield_10038': None, 'environment': None, 'duedate': None, 'progress': {'progress': 0, 'total': 0}, 'votes': {'self': 'https://my.url.net/rest/api/2/issue/TS-1401/votes', 'votes': 0, 'hasVoted': False}}}, {'expand': 'operations,versionedRepresentations,editmeta,changelog,customfield_10010.requestTypePractice,renderedFields', 'id': '11438', 'self': 'https://my.url.net/rest/api/2/issue/11438', 'key': 'TS-1398', 'fields': {'statuscategorychangedate': '2022-11-29T07:05:09.126-0800', 'issuetype': {'self': 'https://my.url.net/rest/api/2/issuetype/10004', 'id': '10004', 'description': 'Functionality or a feature expressed as a user goal.', 'iconUrl': 'https://my.url.net/rest/api/2/universal_avatar/view/type/issuetype/avatar/10315?size=medium', 'name': 'Story', 'subtask': False, 'avatarId': 10315, 'hierarchyLevel': 0}, 'parent': {'id': '11420', 'key': 'TS-1380', 'self': 'https://my.url.net/rest/api/2/issue/11420', 'fields': {'summary': 'Clone30 - Migration Epics', 'status': {'self': 'https://my.url.net/rest/api/2/status/10003', 'description': '', 'iconUrl': 'https://my.url.net/', 'name': 'Backlog', 'id': '10003', 'statusCategory': {'self': 'https://my.url.net/rest/api/2/statuscategory/2', 'id': 2, 'key': 'new', 'colorName': 'blue-gray', 'name': 'To Do'}}, 'priority': {'self': 'https://my.url.net/rest/api/2/priority/3', 'iconUrl': 'https://my.url.net/images/icons/priorities/medium.svg', 'name': 'Medium', 'id': '3'}, 'issuetype': {'self': 'https://my.url.net/rest/api/2/issuetype/10000', 'id': '10000', 'description': 'A big user story that needs to be broken down. Created by Jira Software - do not edit or delete.', 'iconUrl': 'https://my.url.net/images/icons/issuetypes/epic.svg', 'name': 'Epic', 'subtask': False, 'hierarchyLevel': 1}}}, 'timespent': None, 'project': {'self': 'https://my.url.net/rest/api/2/project/10001', 'id': '10001', 'key': 'TS', 'name': 'Project', 'projectTypeKey': 'software', 'simplified': False, 'avatarUrls': {'48x48': 'https://my.url.net/rest/api/2/universal_avatar/view/type/project/avatar/10556', '24x24': 'https://my.url.net/rest/api/2/universal_avatar/view/type/project/avatar/10556?size=small', '16x16': 'https://my.url.net/rest/api/2/universal_avatar/view/type/project/avatar/10556?size=xsmall', '32x32': 'https://my.url.net/rest/api/2/universal_avatar/view/type/project/avatar/10556?size=medium'}}, 'fixVersions': [], 'customfield_10033': None, 'customfield_10034': [], 'aggregatetimespent': None, 'customfield_10035': None, 'resolution': None, 'customfield_10036': None, 'customfield_10037': None, 'customfield_10027': None, 'customfield_10028': None, 'customfield_10029': None, 'resolutiondate': None, 'workratio': -1, 'lastViewed': None, 'watches': {'self': 'https://my.url.net/rest/api/2/issue/TS-1398/watchers', 'watchCount': 1, 'isWatching': True}, 'created': '2022-11-29T07:05:08.312-0800', 'customfield_10020': None, 'customfield_10021': None, 'customfield_10022': None, 'customfield_10023': None, 'priority': {'self': 'https://my.url.net/rest/api/2/priority/3', 'iconUrl': 'https://my.url.net/images/icons/priorities/medium.svg', 'name': 'Medium', 'id': '3'}, 'customfield_10024': None, 'customfield_10025': None, 'customfield_10026': None, 'labels': [], 'customfield_10016': None, 'customfield_10017': None, 'customfield_10018': {'hasEpicLinkFieldDependency': False, 'showField': False, 'nonEditableReason': {'reason': 'EPIC_LINK_SHOULD_BE_USED', 'message': 'To set an epic as the parent, use the epic link instead'}}, 'customfield_10019': '0|i008ae:y', 'timeestimate': None, 'aggregatetimeoriginalestimate': None, 'versions': [], 'issuelinks': [], 'assignee': None, 'updated': '2022-11-29T07:05:22.417-0800', 'status': {'self': 'https://my.url.net/rest/api/2/status/10003', 'description': '', 'iconUrl': 'https://my.url.net/', 'name': 'Backlog', 'id': '10003', 'statusCategory': {'self': 'https://my.url.net/rest/api/2/statuscategory/2', 'id': 2, 'key': 'new', 'colorName': 'blue-gray', 'name': 'To Do'}}, 'components': [], 'timeoriginalestimate': None, 'description': 'Creating reports/reporting cubes; need to find out reports used', 'customfield_10010': None, 'customfield_10014': 'TS-1380', 'customfield_10015': None, 'customfield_10005': None, 'customfield_10006': None, 'security': None, 'customfield_10007': None, 'customfield_10008': None, 'customfield_10009': None, 'aggregatetimeestimate': None, 'summary': '\xa0create reports/cubes', 'creator': {'self': 'https://my.url.net/rest/api/2/user?accountId=5d669f4bf81f2c0d99ee9e38', 'accountId': '5d669f4bf81f2c0d99ee9e38', 'emailAddress': 'test#aol.com', 'avatarUrls': {'48x48': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '24x24': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '16x16': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '32x32': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png'}, 'displayName': 'Joe Test', 'active': True, 'timeZone': 'America/Los_Angeles', 'accountType': 'atlassian'}, 'subtasks': [{'id': '11439', 'key': 'TS-1399', 'self': 'https://my.url.net/rest/api/2/issue/11439', 'fields': {'summary': 'Confirm: any reporting cubes required using this data are created and in production?', 'status': {'self': 'https://my.url.net/rest/api/2/status/10003', 'description': '', 'iconUrl': 'https://my.url.net/', 'name': 'Backlog', 'id': '10003', 'statusCategory': {'self': 'https://my.url.net/rest/api/2/statuscategory/2', 'id': 2, 'key': 'new', 'colorName': 'blue-gray', 'name': 'To Do'}}, 'priority': {'self': 'https://my.url.net/rest/api/2/priority/3', 'iconUrl': 'https://my.url.net/images/icons/priorities/medium.svg', 'name': 'Medium', 'id': '3'}, 'issuetype': {'self': 'https://my.url.net/rest/api/2/issuetype/10006', 'id': '10006', 'description': "A small piece of work that's part of a larger task.", 'iconUrl': 'https://my.url.net/rest/api/2/universal_avatar/view/type/issuetype/avatar/10316?size=medium', 'name': 'Sub-task', 'subtask': True, 'avatarId': 10316, 'hierarchyLevel': -1}}}, {'id': '11440', 'key': 'TS-1400', 'self': 'https://my.url.net/rest/api/2/issue/11440', 'fields': {'summary': 'Confirm: any structured reports using this data are created and in production?', 'status': {'self': 'https://my.url.net/rest/api/2/status/10003', 'description': '', 'iconUrl': 'https://my.url.net/', 'name': 'Backlog', 'id': '10003', 'statusCategory': {'self': 'https://my.url.net/rest/api/2/statuscategory/2', 'id': 2, 'key': 'new', 'colorName': 'blue-gray', 'name': 'To Do'}}, 'priority': {'self': 'https://my.url.net/rest/api/2/priority/3', 'iconUrl': 'https://my.url.net/images/icons/priorities/medium.svg', 'name': 'Medium', 'id': '3'}, 'issuetype': {'self': 'https://my.url.net/rest/api/2/issuetype/10006', 'id': '10006', 'description': "A small piece of work that's part of a larger task.", 'iconUrl': 'https://my.url.net/rest/api/2/universal_avatar/view/type/issuetype/avatar/10316?size=medium', 'name': 'Sub-task', 'subtask': True, 'avatarId': 10316, 'hierarchyLevel': -1}}}], 'reporter': {'self': 'https://my.url.net/rest/api/2/user?accountId=5d669f4bf81f2c0d99ee9e38', 'accountId': '5d669f4bf81f2c0d99ee9e38', 'emailAddress': 'test#aol.com', 'avatarUrls': {'48x48': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '24x24': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '16x16': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png', '32x32': 'https://secure.gravatar.com/avatar/69b7db33e65c274c27a07b28b356e329?d=https%3A%2F%2Favatar-management--avatars.us-west-2.test.png'}, 'displayName': 'Joe Test', 'active': True, 'timeZone': 'America/Los_Angeles', 'accountType': 'atlassian'}, 'aggregateprogress': {'progress': 0, 'total': 0}, 'customfield_10001': None, 'customfield_10002': None, 'customfield_10003': None, 'customfield_10004': None, 'customfield_10038': None, 'environment': None, 'duedate': None, 'progress': {'progress': 0, 'total': 0}, 'votes': {'self': 'https://my.url.net/rest/api/2/issue/TS-1398/votes', 'votes': 0, 'hasVoted': False}}}]
I have a problem. I want to remove all nested elements inside a dict. But unfortunately my code does not work. Every nested element occurs twice, but it should be occurs only once.
What is the problem for that?
Method
def nested_dict(dictionaries):
my_list = []
for my_Dict in dictionaries:
my_new_dict = {}
for key in my_Dict.keys():
if isinstance(my_Dict[key], dict):
idx = str(uuid.uuid4())
my_Dict[key]["__id"] = idx
my_new_dict[key] = my_Dict[key]
my_Dict[key] = idx
my_list.append(my_new_dict)
return my_list
Running example
import uuid
my_Dict = {
'_key': '1',
'group': 'test',
'data': {},
'type': '',
'code': '007',
'conType': '1',
'flag': None,
'createdAt': '2021',
'currency': 'EUR',
'detail': {
'selector': {
'number': '12312',
'isTrue': True,
'requirements': [{
'type': 'customer',
'requirement': '1'}]
}
}
}
my_Dict2 = {
'_key': '2',
'group': 'test',
'data2': {},
'type': '',
'code': '007',
'conType': '1',
'flag': None,
'createdAt': '2021',
'currency': 'EUR',
'detail2': {
'selector': {
'number': '12312',
'isTrue': True,
'requirements': [{
'type': 'customer',
'requirement': '1'}]
}
}
}
dictionaries = [my_Dict, my_Dict2]
def nested_dict(dictionaries):
my_list = []
for my_Dict in dictionaries:
my_new_dict = {}
for key in my_Dict.keys():
if isinstance(my_Dict[key], dict):
idx = str(uuid.uuid4())
my_Dict[key]["__id"] = idx
my_new_dict[key] = my_Dict[key]
my_Dict[key] = idx
my_list.append(my_new_dict)
return my_list
result = nested_dict(dictionaries)
result
[OUT]
[{'data': {'__id': '46f4eb3d-977c-4da4-a99c-c9bfa831b96e'},
'detail': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': 'fad4053e-75e5-4a03-93b6-67e0df814d23'}},
{'data': {'__id': '46f4eb3d-977c-4da4-a99c-c9bfa831b96e'},
'detail': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': 'fad4053e-75e5-4a03-93b6-67e0df814d23'}},
{'data2': {'__id': '6afcf48e-508c-476b-98f3-9bf1e8370fb4'},
'detail2': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': '2d4745ea-decd-45dc-aa0b-7bea5c449c34'}},
{'data2': {'__id': '6afcf48e-508c-476b-98f3-9bf1e8370fb4'},
'detail2': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': '2d4745ea-decd-45dc-aa0b-7bea5c449c34'}}]
What I want
[{'data': {'__id': '46f4eb3d-977c-4da4-a99c-c9bfa831b96e'},
'detail': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': 'fad4053e-75e5-4a03-93b6-67e0df814d23'}},
{'data2': {'__id': '6afcf48e-508c-476b-98f3-9bf1e8370fb4'},
'detail2': {'selector': {'number': '12312',
'isTrue': True,
'requirements': [{'type': 'customer', 'requirement': '1'}]},
'__id': '2d4745ea-decd-45dc-aa0b-7bea5c449c34'}}]
import uuid
import json
my_Dict = {
'_key': '1',
'group': 'test',
'data': {},
'type': '',
'code': '007',
'conType': '1',
'flag': None,
'createdAt': '2021',
'currency': 'EUR',
'detail': {
'selector': {
'number': '12312',
'isTrue': True,
'requirements': [{
'type': 'customer',
'requirement': '1'}]
}
}
}
my_Dict2 = {
'_key': '2',
'group': 'test',
'data2': {},
'type': '',
'code': '007',
'conType': '1',
'flag': None,
'createdAt': '2021',
'currency': 'EUR',
'detail2': {
'selector': {
'number': '12312',
'isTrue': True,
'requirements': [{
'type': 'customer',
'requirement': '1'}]
}
}
}
dictionaries = [my_Dict, my_Dict2]
def nested_dict(dictionaries):
my_list = []
for my_Dict in dictionaries:
my_new_dict = {}
for key in my_Dict.keys():
if isinstance(my_Dict[key], dict):
idx = str(uuid.uuid4())
my_Dict[key]["__id"] = idx
my_new_dict[key] = my_Dict[key]
my_Dict[key] = idx
my_list.append(my_new_dict)
return my_list
output:
[
{
"data": {
"__id": "5c6769cf-01e5-4f5d-acfa-622472163aba"
},
"detail": {
"selector": {
"number": "12312",
"isTrue": true,
"requirements": [
{
"type": "customer",
"requirement": "1"
}
]
},
"__id": "d167277f-4d02-4d53-934b-131187f6f214"
}
},
{
"data2": {
"__id": "e9182913-c2fc-4d60-adb8-b0b8274faf50"
},
"detail2": {
"selector": {
"number": "12312",
"isTrue": true,
"requirements": [
{
"type": "customer",
"requirement": "1"
}
]
},
"__id": "46e6be7b-8903-4d2a-a768-f6b24fcc5d31"
}
}
]
only minor changes needed that is you are appending the list within inner for loop but you should do it at outer for loop level. I have pasted the code with output which I got
I think it is because my_new_dict is holding an object that is changed by the time it appends to the list.
def nested_dict(dictionaries):
my_list = []
for my_Dict in dictionaries:
my_new_dict = {}
for key in my_Dict.keys():
if isinstance(my_Dict[key], dict):
idx = str(uuid.uuid4())
my_Dict[key]["__id"] = idx
my_new_dict[key] = my_Dict[key]
my_Dict[key] = idx
my_list.append({key: my_new_dict[key]})
print(my_list)
return my_list
I have uploaded some tweets in a Mongo DB collection and I would like to extract the following information with PyMongo:
user.screen_name
entities.user_mentions.screen_name
count
i.e. I would like to know who has mentioned whom and how many times, in order to create some kind of network.
I used the following pipeline to get the most mentioned users but I'm not able to introduce also the user.screen_name:
tweets.aggregate([
{'$project': {'mentions': '$entities.user_mentions.screen_name', '_id': 0}},
{'$unwind': '$mentions'},
{'$group': {'_id': '$mentions', 'count': {'$sum': 1}}}
])
Here an example of document (tweet), where I removed some of the fields I'm not interested in:
{'_id': ObjectId('604c805b289d1ef5947e1845'),
'created_at': 'Fri Mar 12 04:36:10 +0000 2021',
'display_text_range': [0, 140],
'entities': {'hashtags': [{'indices': [124, 136], 'text': 'mytag'}],
'symbols': [],
'urls': [],
'user_mentions': [{'id': 123,
'id_str': '123',
'indices': [3, 14],
'name': 'user_name',
'screen_name': 'user_screen_name'}]},
'user': {'id': 456,
'id_str': '456',
'name': 'Author Name',
'screen_name': 'Author Screen Name'}}
{'_id': ObjectId('604c805b289d1ef5947e184x'),
'created_at': 'Fri Mar 12 04:36:10 +0000 2021',
'display_text_range': [0, 140],
'entities': {'hashtags': [{'indices': [124, 136], 'text': 'mytag'}],
'symbols': [],
'urls': [],
'user_mentions': [{'id': 126,
'id_str': '126',
'indices': [3, 14],
'name': 'user_name',
'screen_name': 'user_screen_name'}]},
'user': {'id': 4567,
'id_str': '4567',
'name': 'Other Author Name',
'screen_name': 'Other Author Screen Name'}}
In this example I would expect something like:
{'mentioned': 'user_screen_name',
'author': 'Author Screen Name',
'count': '1'},
{'mentioned': 'user_screen_name',
'author': 'Other Author Screen Name',
'count': '1'},
Can someone help me?
Thank you in advance for your help!
Francesca
db.collection.aggregate([
{
"$project": {
"mentions": "$entities.user_mentions.screen_name",
"author": "$user.screen_name"
}
},
{ "$unwind": "$mentions" },
{
"$group": {
"_id": { aut: "$author", ment: "$mentions" },
"count": { "$sum": 1 },
author: { "$first": "$author" },
mentions: { "$first": "$mentions" }
}
},
{
"$project": { _id: 0 }
}
])
Working Mongo playground