How to get values in script using python - python

I'm creating a crawler with python + beautiful soup.
I have to access the tag to get some data in the dataLayer.
I did a search with beatifulsoup and managed to return the tag that I need but I can not turn it into a json to access the information.
This is the code that I made to get the :
page = get_html('URL')
dataLayer = page.findAll('script')[NUMBER OF SCRIPT]
And this is my return:
<script type="text/javascript">
dataLayer = [{
'site': {
'isMobile': false
},
'page': {
'pageType': 'ad_detail',
'detail': {
'parent_category_id': '2000',
'category_id': '2020',
'state_id': '2',
'region_id': '31',
'ad_id': '293231982',
'list_id': '250941507',
'city_id': '9208',
'zipcode':'34710620',
},
'adDetail': {
'adID': '293231982',
'listID': '250941507',
'sellerName': 'Marr',
'adDate': '2016-11-30 20:52:11',
},
},
'session': {
'user': {
'userID': '',
'loginType': ''
}
},
'pageType': 'Ad_detail',
'abtestingEnable' : '1',
// Listing information
'listingCategory': '2020',
// Ad information
'adId': '293231982',
'state': '2',
'region': '31',
'category': '2020',
'pictures': '8',
'listId': '250941507',
//Account Information
'loggedUser':'0',
'referrer': '',
//User Information
}];
</script>
I would like to get the data as adDate and zipcode.

s = soup.script.text.replace('\'', '"') # replace ' with "
s = re.search(r'\{.+\}', s, re.DOTALL).group() # get json data
s = re.sub(r'//.+\n', '', s) # replace comment
s = re.sub(r'\s+', '', s) # strip whitspace
s = re.sub(r',}', '}', s) # get rid of last , in the dict
json.loads(s)
out:
{'abtestingEnable': '1',
'adId': '293231982',
'category': '2020',
'listId': '250941507',
'listingCategory': '2020',
'loggedUser': '0',
'page': {'adDetail': {'adDate': '2016-11-3020:52:11',
'adID': '293231982',
'listID': '250941507',
'sellerName': 'Marr'},
'detail': {'ad_id': '293231982',
'category_id': '2020',
'city_id': '9208',
'list_id': '250941507',
'parent_category_id': '2000',
'region_id': '31',
'state_id': '2',
'zipcode': '34710620'},
'pageType': 'ad_detail'},
'pageType': 'Ad_detail',
'pictures': '8',
'referrer': '',
'region': '31',
'session': {'user': {'loginType': '', 'userID': ''}},
'site': {'isMobile': False},
'state': '2'}

Your json is using single quotes instead of double quotes.
you should replace all single quotes with doubles quotes to make you dataLayer variable json compliant.
A simple .replace("'", '"') should do the trick.
Note : you also have to remove the commented line with a second regex.

Related

Updating a list of map in Dynamo DB using Boto3 with Python

I have a response from my Dynamo DB as:
{
'subject': 'Mathematics',
'course_id': '123',
'Term': 'Second',
'stats':
{'student_stats':
[
{'student_id': '234',
'registration_id': '321'},
{'student_id': '987',
'registration_id': '456'}
]
},
}
Where the partition key is 'subject' and the sort key is 'course_id'.
I would like to update the 'student_stats' for student with 'student_id' as '234' to passed.
The output should look like this:
{
'subject': 'Mathematics',
'course_id': '123',
'Term': 'Second',
'stats':
{'student_stats':
[
{'student_id': '234',
'registration_id': '321',
'status: 'passed'},
{'student_id': '987',
'registration_id': '456'}
]
},
}
This is my implementation so far:
dynamodb = boto3.resource(
'dynamodb',
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=ACCESS_SECRET
)
table = dynamodb.Table(TABLE_NAME)
response = table.update_item(Key={'subject': 'Mathematics',
'course_id': '123'},
UpdateExpression="set #status=:s",
ExpressionAttributeNames={
'#status': 'stats.student_stats'
},
ExpressionAttributeValues={
':s': 'passed'
},
ReturnValues="UPDATED_NEW"
)

How extract the key from dictionary and make it as primary key

Myd is below
{ 'Owner': [ { 'id': '1', 'name': 'John', 'contactEmail': 'john#nif.com', 'role': 'Owner' }, { 'id': '2', 'contactName': 'Work', 'contactEmail': 'work#nif.com', 'role': 'Owner' } ], 'Manager': [ { 'id': '1', 'name': 'John', 'contactEmail': 'john#nif.com', 'role': 'Manager' } ] }
Extract id to outside
Add entire dictionary into a new key called 'employee'
For the same key role are there in two different keys merge to one
id=1 role is present as Owner and Manager, output will role:['Manager', 'Owner']
Expected out
{ 'employee': { '1': { 'email': 'john#nif.com', 'name': 'John', 'role': [ 'Owner', 'Manager' ] }, '2': { 'email': 'work#nif.com', 'name': 'Work', 'role': [ 'Owner' ] } } }
emp = {}
for key,val in event.items():
for each in val:
# [{'employee': key, **val} for key, val in event.items()] if event else []
emp['employee'] = each['id']
emp['name'] = each['name']
using python native method
Here's a try without using third party lib:
myd = {
'Owner': [
{ 'id': '1', 'name': 'John', 'contactEmail': 'john#nif.com', 'role': 'Owner' },
{ 'id': '2', 'contactName': 'Work', 'contactEmail': 'work#nif.com', 'role': 'Owner' }
],
'Manager': [ { 'id': '1', 'name': 'John', 'contactEmail': 'john#nif.com', 'role': 'Manager' } ]
}
empl_dict = {}
for employees in myd.values():
for emp in employees:
emp_id = emp.pop('id')
emp_role = emp.pop('role')
empl_dict[emp_id] = empl_dict.get(emp_id, {})
empl_dict[emp_id].update(emp)
empl_dict[emp_id]['role'] = empl_dict[emp_id].get('role', [])
empl_dict[emp_id]['role'].append(emp_role)
all_employees = {'employee': empl_dict}
print(all_employees)
results in:
{'employee': {'1': {'name': 'John', 'contactEmail': 'john#nif.com', 'role': ['Owner', 'Manager']}, '2': {'contactName': 'Work', 'contactEmail': 'work#nif.com', 'role': ['Owner']}}}
You can use pandas to achieve this
Converting to pandas dataframe followed by groupby on contactEmail and aggregating results in required manner
df = pd.concat([pd.DataFrame(v).assign(key=k) for k,v in a.items()])
res = df.groupby('contactEmail').agg({'role':list,'name':'first'}).reset_index().T.to_dict()
{'employee':res}
out:
{'employee': {0: {'contactEmail': 'john#nif.com',
'role': ['Owner', 'Manager'],
'name': 'John'},
1: {'contactEmail': 'work#nif.com', 'role': ['Owner'], 'name': nan}}}
Edit:
if you want to achieve this in python
for OM in a.keys():
for ids in a[OM]:
ids['role'] = [OM]
total_recs = sum(list(a.values()),[])
res = {}
for rec in total_recs:
ID = rec['id']
if ID not in res.keys():
rec.pop('id')
res[ID] = rec
else:
rec.pop('id')
res[ID]['role'].extend(rec['role'])
{'employee':res}
Out:
{'employee': {'1': {'name': 'John',
'contactEmail': 'john#nif.com',
'role': ['Owner', 'Manager']},
'2': {'contactName': 'Work',
'contactEmail': 'work#nif.com',
'role': ['Owner']}}}

How can I implement this recursion in python?

Let's say that I have a Dictionary like this
dict1 = [{
'Name': 'Team1',
'id': '1',
'Members': [
{
'type': 'user',
'id': '11'
},
{
'type': 'user',
'id': '12'
}
]
},
{
'Name': 'Team2',
'id': '2',
'Members': [
{
'type': 'group'
'id': '1'
},
{
'type': 'user',
'id': '21'
}
]
},
{
'Name': 'Team3',
'id': '3',
'Members': [
{
'type': 'group'
'id': '2'
}
]
}]
and I want to get an output that can replace all the groups and nested groups with all distinct users.
In this case the output should look like this:
dict2 = [{
'Name': 'Team1',
'id': '1',
'Members': [
{
'type': 'user',
'id': '11'
},
{
'type': 'user',
'id': '12'
}
]
},
{
'Name': 'Team2',
'id': '2',
'Members': [
{
'type': 'user',
'id': '11'
},
{
'type': 'user',
'id': '12'
}
{
'type': 'user',
'id': '21'
}
]
},
{
'Name': 'Team3',
'id': '3',
'Members': [
{
'type': 'user',
'id: '11'
},
{
'type': 'user',
'id': '12'
}
{
'type': 'user',
'id': '21'
}
]
}]
Now let's assume that I have a large dataset to perform these actions on. (approx 20k individual groups)
What would be the best way to code this? I am attempting recursion, but I am not sure about how to search through the dictionary and lists in this manner such that it doesn't end up using too much memory
I do not think you need recursion. Looping is enough.
I think you can simply evaluate each Memberss, fetch users if group type, and make them unique. Then you can simply replace Members's value with distinct_users.
You might have a dictionary for groups like:
group_dict = {
'1': [
{'type': 'user', 'id': '11'},
{'type': 'user', 'id': '12'}
],
'2': [
{'type': 'user', 'id': '11'},
{'type': 'user', 'id': '12'},
{'type': 'user', 'id': '21'}
],
'3': [
{'type': 'group', 'id': '1'},
{'type': 'group', 'id': '2'},
{'type': 'group', 'id': '3'} # recursive
]
...
}
You can try:
def users_in_group(group_id):
users = []
groups_to_fetch = []
for user_or_group in group_dict[group_id]:
if user_or_group['type'] == 'group':
groups_to_fetch.append(user_or_group)
else: # 'user' type
users.append(user_or_group)
groups_fetched = set() # not to loop forever
while groups_to_fetch:
group = groups_to_fetch.pop()
if group['id'] not in groups_fetched:
groups_fetched.add(group['id'])
for user_or_group in group_dict[group['id']]:
if user_or_group['type'] == 'group' and user_or_group['id'] not in groups_fetched:
groups_to_fetch.append(user_or_group)
else: # 'user' type
users.append(user_or_group)
return users
def distinct_users_in(members):
distinct_users = []
def add(user):
if user['id'] not in user_id_set:
distinct_users.append(user)
user_id_set.add(user['id'])
user_id_set = set()
for member in members:
if member['type'] == 'group':
for user in users_in_group(member['id']):
add(user)
else: # 'user'
user = member
add(user)
return distinct_users
dict2 = dict1 # or `copy.deepcopy`
for element in dict2:
element['Members'] = distinct_users_in(element['Members'])
Each Members is re-assigned by distinct_users returned by the corresponding function.
The function takes Members and fetches users from each if member type. If user type, member itself is a user. While (fetched) users are appended to distinct_user, you can use their ids for uniquity.
When you fetch users_in_group, you can use two lists; groups_to_fetch and groups_fetched. The former is a stack to recursively fetch all groups in a group. The latter is not to fetch an already fetched group again. Or, it could loop forever.
Finally, if your data are already in memory, this approach may not exhaust memory and work.

How to extract values from list and store it as dictionary(key-value pair)?

I need to extract 2 values from this list of dictionary and store it as a key-value pair.
Here I attached sample data..Where I need to extract "Name" and "Service" from this input and store it as a dictionary. Where "Name" is Key and corresponding "Service" is its value.
Input:
response = {
'Roles': [
{
'Path': '/',
'Name': 'Heera',
'Age': '25',
'Policy': 'Policy1',
'Start_Month': 'January',
'PolicyDocument':
{
'Date': '2012-10-17',
'Statement': [
{
'id': '',
'RoleStatus': 'New_Joinee',
'RoleType': {
'Service': 'Service1'
},
'Action': ''
}
]
},
'Duration': 3600
},
{
'Path': '/',
'Name': 'Prem',
'Age': '40',
'Policy': 'Policy2',
'Start_Month': 'April',
'PolicyDocument':
{
'Date': '2018-11-27',
'Statement': [
{
'id': '',
'RoleStatus': 'Senior',
'RoleType': {
'Service': ''
},
'Action': ''
}
]
},
'Duration': 2600
},
]
}
From this input, I need output as a dictionary type.
Output Format: { Name : Service }
Output:
{ "Heera":"Service1","Prem" : " "}
My try:
Role_name =[]
response = {#INPUT WHICH I SPECIFIED ABOVE#}
roles = response['Roles']
for role in roles:
Role_name.append(role['Name'])
print(Role_name)
I need to pair the name with its corresponding service. Any help would be really appreciable.
Thanks in advance.
You just have to write a long line which can reach till the key 'Service'.
And you a syntax error in line Start_Month': 'January') and 'Start_Month': 'April'). You can't have one unclosed brackets.
Fix it and run the following.
This is the code:
output_dict = {}
for r in response['Roles']:
output_dict[r["Name"]] = r['PolicyDocument']['Statement'][0]['RoleType']['Service']
print(output_dict)
Output:
{'Heera': 'Service1', 'Prem': ''}
You just have to do like this:
liste = []
for role in response['Roles']:
liste.append(
{
role['Name']:role['PolicyDocument']['Statement'][0]['RoleType']['Service'],
}
)
print(liste)
It seems your input data is structured kind of strange and I am not sure what the ) are doing next to the months since they make things invalid but here is a working script assuming you removed the parenthesis from your input.
response = {
'Roles': [
{
'Path': '/',
'Name': 'Heera',
'Age': '25',
'Policy': 'Policy1',
'Start_Month': 'January',
'PolicyDocument':
{
'Date': '2012-10-17',
'Statement': [
{
'id': '',
'RoleStatus': 'New_Joinee',
'RoleType': {
'Service': 'Service1'
},
'Action': ''
}
]
},
'Duration': 3600
},
{
'Path': '/',
'Name': 'Prem',
'Age': '40',
'Policy': 'Policy2',
'Start_Month': 'April',
'PolicyDocument':
{
'Date': '2018-11-27',
'Statement': [
{
'id': '',
'RoleStatus': 'Senior',
'RoleType': {
'Service': ''
},
'Action': ''
}
]
},
'Duration': 2600
},
]
}
output = {}
for i in response['Roles']:
output[i['Name']] = i['PolicyDocument']['Statement'][0]['RoleType']['Service']
print(output)
This should give you what you want in a variable called role_services:
role_services = {}
for role in response['Roles']:
for st in role['PolicyDocument']['Statement']:
role_services[role['Name']] = st['RoleType']['Service']
It will ensure you'll go through all of the statements within that data structure but be aware you'll overwrite key-value pairs as you traverse the response, if they exist in more than a single entry!
A reference on for loops which might be helpful, illustrates using if statements within them which can help you to extend this to check if items already exist!
Hope that helps

Convert nested XML content into CSV using xml tree in python

I'm very new to python and please treat me as same. When i tried to convert the XML content into List of Dictionaries I'm getting output but not as expected and tried a lot playing around.
XML Content
<project>
<data>
<row>
<respondent>m0wxo5f6w42h3fot34m7s6xij</respondent>
<timestamp>10-06-16 11:30</timestamp>
<product>1</product>
<replica>1</replica>
<seqnr>1</seqnr>
<session>1</session>
<column>
<question>Q1</question>
<answer>a1</answer>
</column>
<column>
<question>Q2</question>
<answer>a2</answer>
</column>
</row>
<row>
<respondent>w42h3fot34m7s6x</respondent>
<timestamp>10-06-16 11:30</timestamp>
<product>1</product>
<replica>1</replica>
<seqnr>1</seqnr>
<session>1</session>
<column>
<question>Q3</question>
<answer>a3</answer>
</column>
<column>
<question>Q4</question>
<answer>a4</answer>
</column>
<column>
<question>Q5</question>
<answer>a5</answer>
</column>
</row>
</data>
</project>
Code i have used:
import xml.etree.ElementTree as ET
tree = ET.parse(xml_file.xml) # import xml from
root = tree.getroot()
data_list = []
for item in root.find('./data'): # find all projects node
data = {} # dictionary to store content of each projects
for child in item:
data[child.tag] = child.text # add item to dictionary
#-----------------for loop with subchild is not working as expcted in my case
for subchild in child:
data[subchild.tag] = subchild.text
data_list.append(data)
print(data_list)
headers = {k for d in data_list for k in d.keys()} # headers for csv
with open(csv_file,'w') as f:
writer = csv.DictWriter(f, fieldnames = headers) # creating a DictWriter object
writer.writeheader() # write headers to csv
writer.writerows(data_list)
Output for the data_list is getting the last info of question to the list of dictionaries.
i guess the issue is at subchild forloop but im not understanding how to append the list with dictionaries.
[{
'respondent': 'anonymous_m0wxo5f6w42h3fot34m7s6xij',
'timestamp': '10-06-16 11:30',
'product': '1',
'replica': '1',
'seqnr': '1',
'session': '1',
'column': '\n ,
'question': 'Q2',
'answer': 'a2'
},
{
'respondent': 'w42h3fot34m7s6x',
'timestamp': '10-06-16 11:30',
'product': '1',
'replica': '1',
'seqnr': '1',
'session': '1',
'column': '\n ,
'question': 'Q2',
'answer': 'a2'
}.......
]
I expect the below output, tried a lot but unable to loop over the column tag.
[{
'respondent': 'anonymous_m0wxo5f6w42h3fot34m7s6xij',
'timestamp': '10-06-16 11:30',
'product': '1',
'replica': '1',
'seqnr': '1',
'session': '1',
'question': 'Q1',
'answer': 'a1'
},
{
'respondent': 'anonymous_m0wxo5f6w42h3fot34m7s6xij',
'timestamp': '10-06-16 11:30',
'product': '1',
'replica': '1',
'seqnr': '1',
'session': '1',
'question': 'Q2',
'answer': 'a2'
},
{
'respondent': 'w42h3fot34m7s6x',
'timestamp': '10-06-16 11:30',
'product': '1',
'replica': '1',
'seqnr': '1',
'session': '1',
'question': 'Q3',
'answer': 'a3'
},
{
'respondent': 'w42h3fot34m7s6x',
'timestamp': '10-06-16 11:30',
'product': '1',
'replica': '1',
'seqnr': '1',
'session': '1',
'question': 'Q4',
'answer': 'a4'
},
{
'respondent': 'w42h3fot34m7s6x',
'timestamp': '10-06-16 11:30',
'product': '1',
'replica': '1',
'seqnr': '1',
'session': '1',
'question': 'Q5',
'answer': 'a5'
}
]
I have refereed so many stack overflow questions on xml tree but still didn't helped me.
any help/suggestion is appreciated.
I had a problem understanding what this code is supposed to do because it uses abstract variable names like item, child, subchild and this makes it hard to reason about the code. I'm not as clever as that, so I renamed the variables to row, tag, and column to make it easier for me to see what the code is doing. (In my book, even row and column are a bit abstract, but I suppose the opacity of the XML input is hardly your fault.)
You have 2 rows but you want 5 dictionaries, because you have 5 <column> tags and you want each <column>'s data in a separate dictionary. But you want the other tags in the <row> to be repeated along with each <column>'s data.
That means you need to build a dictionary for every <row>, then, for each <column>, add that column's data to the dictionary, then output it before going on to the next column.
This code makes the simplifying assumption that all of your <columns>s have the same structure, with exactly one <question> and exactly one <answer> and nothing else. If this assumption does not hold then a <column> may get reported with stale data it inherited from the previous <column> in the same row. It will also produce no output at all for any <row> that does not have at least one <column>.
The code has to loop through the tags twice, once for the non-<column>s and once for the <column>s. Otherwise it can't be sure it has seen all the non-<column> tags before it starts outputting the <column>s.
There are other (no doubt more elegant) ways to do this, but I kept the code structure as close to your original as I could, other than making the variable names less opaque.
for row in root.find('./data'): # find all projects node
data = {} # dictionary to store content of each projects
for tag in row:
if tag.tag != "column":
data[tag.tag] = tag.text # add row to dictionary
# Now the dictionary data is built for the row level
for tag in row:
if tag.tag == "column":
for column in tag:
data[column.tag] = column.text
# Now we have added the column level data for one column tag
data_list.append(data.copy())
Output is as below. The key order of the dicts isn't preserved because I used pprint.pprint for convenience.
[{'answer': 'a1',
'product': '1',
'question': 'Q1',
'replica': '1',
'respondent': 'm0wxo5f6w42h3fot34m7s6xij',
'seqnr': '1',
'session': '1',
'timestamp': '10-06-16 11:30'},
{'answer': 'a2',
'product': '1',
'question': 'Q2',
'replica': '1',
'respondent': 'm0wxo5f6w42h3fot34m7s6xij',
'seqnr': '1',
'session': '1',
'timestamp': '10-06-16 11:30'},
{'answer': 'a3',
'product': '1',
'question': 'Q3',
'replica': '1',
'respondent': 'w42h3fot34m7s6x',
'seqnr': '1',
'session': '1',
'timestamp': '10-06-16 11:30'},
{'answer': 'a4',
'product': '1',
'question': 'Q4',
'replica': '1',
'respondent': 'w42h3fot34m7s6x',
'seqnr': '1',
'session': '1',
'timestamp': '10-06-16 11:30'},
{'answer': 'a5',
'product': '1',
'question': 'Q5',
'replica': '1',
'respondent': 'w42h3fot34m7s6x',
'seqnr': '1',
'session': '1',
'timestamp': '10-06-16 11:30'}]

Categories