Related
I'm trying to scrape the data from this interactive chart which is located at the bottom of the website below: https://www.vgchartz.com/tools/hw_date.php?reg=USA&ending=Yearly
I've used developer tools in chrome but cannot find the data points in the elements tab.
I would appreciate it if someone could take a look and tell me whether the data points are stored somewhere on the page, or whether there is any way to extract them using Python.
Thank you very much!
The data is included inside a <script> tag on that page. To parse it, you can use the js2py library. For example:
import ast
import re

import js2py
import requests

url = "https://www.vgchartz.com/tools/hw_date.php?reg=USA&ending=Yearly"

# The chart options are passed as an object literal to StockChart(...) inside a
# <script> tag; capture that literal (re.S lets "." match across newlines).
match = re.search(
    r"StockChart\(({.*?})\);", requests.get(url).text, flags=re.S
)
if match is None:
    # Fail with a clear message instead of an AttributeError on None.group().
    raise ValueError("StockChart(...) call not found in the page at " + url)
data = match.group(1)

# NOTE(review): this evaluates JavaScript taken from a remote page; only do
# this for sources you trust.
data = js2py.eval_js("data = " + data + ";")
# Round-trip through str() so the js2py result is parsed into plain Python
# containers by ast.literal_eval.
data = ast.literal_eval(str(data))
print(data)
Prints:
{
"chart": {
"endOnTick": False,
"marginBottom": 90,
"marginLeft": 80,
"marginRight": 20,
"renderTo": "chart_container",
"startOnTick": False,
"zoomType": "y",
},
"legend": {"enabled": True},
"plotOptions": {"series": {"dataGrouping": {"smoothed": True}}},
"rangeSelector": {"selected": 5},
"series": [
{
"cropThreshold": 1,
"data": [
{"x": 1072933200000, "y": 1302609},
{"x": 1104555600000, "y": 2604003},
{"x": 1136091600000, "y": 5074726},
{"x": 1167627600000, "y": 8757992},
{"x": 1199163600000, "y": 10156740},
{"x": 1230786000000, "y": 10369446},
{"x": 1262322000000, "y": 8434877},
{"x": 1293858000000, "y": 4335275},
{"x": 1325394000000, "y": 2164269},
{"x": 1357016400000, "y": 600849},
],
...
The chart data is embedded in the HTML response of a basic GET request to the link. The data points themselves can be found in a JavaScript object literal inside a script tag. To pull these points, you will need to use some sort of JavaScript parser to access the object and convert its representation to a Python dictionary. Pure JSON parsing cannot easily be used here, since the object is not written in valid JSON syntax. The code below uses the module pyjsparser, which can be installed via pip: pip3 install pyjsparser.
import requests, pyjsparser
from bs4 import BeautifulSoup as soup
# Fetch the page and hand BeautifulSoup the markup text; passing the Response
# object itself fails because it is neither a string nor a file-like object.
d = soup(requests.get('https://www.vgchartz.com/tools/hw_date.php?reg=USA&ending=Yearly').text, 'html.parser')
# Parse the JavaScript of the <script> element that follows the
# .chart_date_selector element inside #chart_body.
ast = pyjsparser.parse(d.select_one('#chart_body > .chart_date_selector ~ script').text)
def to_json(ast):
    """Convert a parsed JavaScript literal node into plain Python data.

    Args:
        ast: A node dict with a ``'type'`` entry, in the ESTree-style shape
            this code reads (``elements``, ``properties``, ``key``, ``value``).

    Returns:
        A list for ``ArrayExpression``, a dict for ``ObjectExpression``, the
        stored value for ``Literal``, a signed number for a unary ``-``/``+``
        applied to a numeric literal, and ``None`` for any other node type.
    """
    node_type = ast['type']
    if node_type == 'ArrayExpression':
        return [to_json(i) for i in ast['elements']]
    if node_type == 'ObjectExpression':
        result = {}
        for i in ast['properties']:
            key = i['key']
            # Unquoted keys carry 'name'; quoted keys ({"x": 1}) are literals
            # and carry 'value' instead.
            result[key['name'] if 'name' in key else key['value']] = to_json(i['value'])
        return result
    if node_type == 'Literal':
        return ast['value']
    if node_type == 'UnaryExpression' and ast.get('operator') in ('-', '+'):
        # Negative numbers are a unary minus wrapped around a positive literal.
        operand = to_json(ast['argument'])
        if isinstance(operand, (int, float)) and not isinstance(operand, bool):
            return -operand if ast['operator'] == '-' else operand
    # Anything else (identifiers, calls, ...) has no literal value.
    return None
def get_chart_data(ast):
    """Yield the converted value of each ``series`` array property in *ast*.

    Walks nested dicts and lists depth-first. When a ``Property`` node named
    ``series`` holds an ``ArrayExpression``, its value is converted with
    ``to_json`` and yielded, and the walk does not descend into that node.

    Args:
        ast: A parsed-JavaScript tree made of dicts, lists and scalars.

    Yields:
        One Python list per matching ``series`` property.
    """
    if isinstance(ast, dict):
        if ast.get('type') == 'Property':
            key = ast.get('key')
            value = ast.get('value')
            # Unquoted keys carry 'name'; quoted keys are literals with 'value'.
            # Reading both avoids a KeyError on quoted keys elsewhere in the tree.
            name = key.get('name', key.get('value')) if isinstance(key, dict) else None
            if name == 'series' and isinstance(value, dict) and value.get('type') == 'ArrayExpression':
                yield to_json(value)
                return
        for b in ast.values():
            yield from get_chart_data(b)
    elif isinstance(ast, list):
        for i in ast:
            yield from get_chart_data(i)
# Take the first `series` array found in the parsed script; next() raises
# StopIteration if the generator yields nothing.
data = next(get_chart_data(ast))
Output:
[{'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'PSP', 'data': [{'x': 1072933200000.0, 'y': 0.0}, {'x': 1104555600000.0, 'y': 3527367.0}, {'x': 1136091600000.0, 'y': 2952955.0}, {'x': 1167627600000.0, 'y': 3563757.0}, {'x': 1199163600000.0, 'y': 3815680.0}, {'x': 1230786000000.0, 'y': 2452361.0}, {'x': 1262322000000.0, 'y': 1824105.0}, {'x': 1293858000000.0, 'y': 1245169.0}, {'x': 1325394000000.0, 'y': 274023.0}, {'x': 1357016400000.0, 'y': 128378.0}, {'x': 1388552400000.0, 'y': 26999.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'Wii', 'data': [{'x': 1136091600000.0, 'y': 1075329.0}, {'x': 1167627600000.0, 'y': 6444409.0}, {'x': 1199163600000.0, 'y': 9826502.0}, {'x': 1230786000000.0, 'y': 8989309.0}, {'x': 1262322000000.0, 'y': 7398500.0}, {'x': 1293858000000.0, 'y': 4878060.0}, {'x': 1325394000000.0, 'y': 2042064.0}, {'x': 1357016400000.0, 'y': 773488.0}, {'x': 1388552400000.0, 'y': 216453.0}, {'x': 1420088400000.0, 'y': 58765.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'X360', 'data': [{'x': 1104555600000.0, 'y': 563282.0}, {'x': 1136091600000.0, 'y': 3832778.0}, {'x': 1167627600000.0, 'y': 4356599.0}, {'x': 1199163600000.0, 'y': 4784134.0}, {'x': 1230786000000.0, 'y': 4691537.0}, {'x': 1262322000000.0, 'y': 6999773.0}, {'x': 1293858000000.0, 'y': 7777810.0}, {'x': 1325394000000.0, 'y': 6488715.0}, {'x': 1357016400000.0, 'y': 3164108.0}, {'x': 1388552400000.0, 'y': 855780.0}, {'x': 1420088400000.0, 'y': 165018.0}, {'x': 1451624400000.0, 'y': 67456.0}, {'x': 1483246800000.0, 'y': 5433.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'DS', 'data': [{'x': 1072933200000.0, 'y': 1302609.0}, {'x': 1104555600000.0, 'y': 2604003.0}, {'x': 1136091600000.0, 'y': 5074726.0}, {'x': 1167627600000.0, 'y': 8757992.0}, {'x': 1199163600000.0, 'y': 10156740.0}, {'x': 1230786000000.0, 'y': 10369446.0}, {'x': 1262322000000.0, 'y': 8434877.0}, {'x': 1293858000000.0, 'y': 4335275.0}, {'x': 1325394000000.0, 'y': 2164269.0}, {'x': 
1357016400000.0, 'y': 600849.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'PS3', 'data': [{'x': 1136091600000.0, 'y': 667762.0}, {'x': 1167627600000.0, 'y': 2474435.0}, {'x': 1199163600000.0, 'y': 3547363.0}, {'x': 1230786000000.0, 'y': 4255949.0}, {'x': 1262322000000.0, 'y': 4737437.0}, {'x': 1293858000000.0, 'y': 4486935.0}, {'x': 1325394000000.0, 'y': 3480788.0}, {'x': 1357016400000.0, 'y': 2237467.0}, {'x': 1388552400000.0, 'y': 721523.0}, {'x': 1420088400000.0, 'y': 274884.0}, {'x': 1451624400000.0, 'y': 125072.0}, {'x': 1483246800000.0, 'y': 12482.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': '3DS', 'data': [{'x': 1293858000000.0, 'y': 4056029.0}, {'x': 1325394000000.0, 'y': 3542069.0}, {'x': 1357016400000.0, 'y': 3905067.0}, {'x': 1388552400000.0, 'y': 2518536.0}, {'x': 1420088400000.0, 'y': 2499074.0}, {'x': 1451624400000.0, 'y': 2456456.0}, {'x': 1483246800000.0, 'y': 2060575.0}, {'x': 1514782800000.0, 'y': 1534418.0}, {'x': 1546318800000.0, 'y': 615394.0}, {'x': 1577854800000.0, 'y': 249847.0}, {'x': 1609477200000.0, 'y': 2787.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'PSV', 'data': [{'x': 1293858000000.0, 'y': 0.0}, {'x': 1325394000000.0, 'y': 1227049.0}, {'x': 1357016400000.0, 'y': 509085.0}, {'x': 1388552400000.0, 'y': 324396.0}, {'x': 1420088400000.0, 'y': 247162.0}, {'x': 1451624400000.0, 'y': 108422.0}, {'x': 1483246800000.0, 'y': 20908.0}, {'x': 1514782800000.0, 'y': 3119.0}, {'x': 1546318800000.0, 'y': 0.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'WiiU', 'data': [{'x': 1325394000000.0, 'y': 899502.0}, {'x': 1357016400000.0, 'y': 1214283.0}, {'x': 1388552400000.0, 'y': 1565573.0}, {'x': 1420088400000.0, 'y': 1361771.0}, {'x': 1451624400000.0, 'y': 460025.0}, {'x': 1483246800000.0, 'y': 9553.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'PS4', 'data': [{'x': 1357016400000.0, 'y': 2072802.0}, {'x': 1388552400000.0, 'y': 4656798.0}, {'x': 1420088400000.0, 'y': 5802890.0}, 
{'x': 1451624400000.0, 'y': 5077809.0}, {'x': 1483246800000.0, 'y': 5404655.0}, {'x': 1514782800000.0, 'y': 5245644.0}, {'x': 1546318800000.0, 'y': 3824746.0}, {'x': 1577854800000.0, 'y': 2084519.0}, {'x': 1609477200000.0, 'y': 613650.0}, {'x': 1641013200000.0, 'y': 168996.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'XOne', 'data': [{'x': 1357016400000.0, 'y': 1928445.0}, {'x': 1388552400000.0, 'y': 4325832.0}, {'x': 1420088400000.0, 'y': 4938473.0}, {'x': 1451624400000.0, 'y': 4696984.0}, {'x': 1483246800000.0, 'y': 4307564.0}, {'x': 1514782800000.0, 'y': 4304447.0}, {'x': 1546318800000.0, 'y': 2971031.0}, {'x': 1577854800000.0, 'y': 1508975.0}, {'x': 1609477200000.0, 'y': 503439.0}, {'x': 1641013200000.0, 'y': 22086.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'NS', 'data': [{'x': 1483246800000.0, 'y': 4880477.0}, {'x': 1514782800000.0, 'y': 5613050.0}, {'x': 1546318800000.0, 'y': 6479134.0}, {'x': 1577854800000.0, 'y': 9027556.0}, {'x': 1609477200000.0, 'y': 8012707.0}, {'x': 1641013200000.0, 'y': 2163947.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'PS5', 'data': [{'x': 1577854800000.0, 'y': 1940361.0}, {'x': 1609477200000.0, 'y': 4302047.0}, {'x': 1641013200000.0, 'y': 1328693.0}]}, {'showLastLabel': True, 'cropThreshold': 1.0, 'name': 'XS', 'data': [{'x': 1577854800000.0, 'y': 1525675.0}, {'x': 1609477200000.0, 'y': 3893991.0}, {'x': 1641013200000.0, 'y': 1937560.0}]}]
I have orig list of tuples that comprises dict & text values.
orig = [({'x': 28.346, 'y': 19},'Text0'),
({'x': 109.726, 'y': 19},'Text1'),
({'x': 147.776, 'y': 19},'Text2'),
({'x': 153.606, 'y': 24}, 'Text3'),
({'x': 452.788, 'y': 24}, 'Text4'),
({'x': 504.168, 'y': 34}, 'Text5'),
({'x': 527.768, 'y': 34}, 'Text6'),
({'x': 533.598, 'y': 45},'Text7'),
({'x': 64.291, 'y': 55},'Text8'),
({'x': 98.623, 'y': 55},'Text9')]
and I want to group the tuples by the value of the 'y' key, which would give me a list of lists with one sub-list per unique value of y. Something like the following:
res = [
[({'x': 28.346, 'y': 19},'Text0'),
({'x': 109.726, 'y': 19},'Text1'),
({'x': 147.776, 'y': 19},'Text2')],
[({'x': 153.606, 'y': 24}, 'Text3'),
({'x': 452.788, 'y': 24}, 'Text4')],
[({'x': 504.168, 'y': 34}, 'Text5'),
({'x': 527.768, 'y': 34}, 'Text6')],
[({'x': 533.598, 'y': 45},'Text7')],
[({'x': 64.291, 'y': 55},'Text8'),
({'x': 98.623, 'y': 55},'Text9')]]
If you use numpy it will be a bit easier.
import numpy as np
orig = [
    ({'x': 28.346, 'y': 19}, 'Text0'),
    ({'x': 109.726, 'y': 19}, 'Text1'),
    ({'x': 147.776, 'y': 19}, 'Text2'),
    ({'x': 153.606, 'y': 24}, 'Text3'),
    ({'x': 452.788, 'y': 24}, 'Text4'),
    ({'x': 504.168, 'y': 34}, 'Text5'),
    ({'x': 527.768, 'y': 34}, 'Text6'),
    ({'x': 533.598, 'y': 45}, 'Text7'),
    ({'x': 64.291, 'y': 55}, 'Text8'),
    ({'x': 98.623, 'y': 55}, 'Text9'),
]

# Pull the grouping key (the 'y' value) out of every (dict, text) pair.
input_array = np.array([entry[0]['y'] for entry in orig])

# For each distinct y (np.unique returns them sorted), record the positions
# in orig where that value occurs.
out_array = []
for y_value in np.unique(input_array):
    positions = np.where(input_array == y_value)[0]
    out_array.append(positions.tolist())

# Map every list of positions back to the original tuples.
res = []
for positions in out_array:
    res.append([orig[pos] for pos in positions])

print(res)
Output:
[[({'x': 28.346, 'y': 19}, 'Text0'),
({'x': 109.726, 'y': 19}, 'Text1'),
({'x': 147.776, 'y': 19}, 'Text2')],
[({'x': 153.606, 'y': 24}, 'Text3'),
({'x': 452.788, 'y': 24}, 'Text4')],
[({'x': 504.168, 'y': 34}, 'Text5'),
({'x': 527.768, 'y': 34}, 'Text6')],
[({'x': 533.598, 'y': 45}, 'Text7')],
[({'x': 64.291, 'y': 55}, 'Text8'),
({'x': 98.623, 'y': 55}, 'Text9')]]
A two-liner solution using itertools.groupby and list comprehension:
from itertools import groupby
# groupby only merges *consecutive* items with equal keys, so sort orig by its
# dict "y" value first (sorted is stable, so ties keep their original order),
# then turn each group iterator into a list.
print([list(v) for k, v in groupby(sorted(orig, key=lambda x: x[0]["y"]),
                                   key=lambda x: x[0]["y"])])
Result:
[[({'x': 28.346, 'y': 19}, 'Text0'), ({'x': 109.726, 'y': 19}, 'Text1'), ({'x': 147.776, 'y': 19}, 'Text2')], [({'x': 153.606, 'y': 24}, 'Text3'), ({'x': 452.788, 'y': 24}, 'Text4')], [({'x': 504.168, 'y': 34}, 'Text5'), ({'x': 527.768, 'y': 34}, 'Text6')], [({'x': 533.598, 'y': 45}, 'Text7')], [({'x': 64.291, 'y': 55}, 'Text8'), ({'x': 98.623, 'y': 55}, 'Text9')]]
I hope this counts :)
What's the best way to represent this data-structure in python:
[{'x': 230, 'y': 50}, {'x': 350, 'y': 50}, {'x': 410, 'y': 50}]
It's not json, it's something else, sorry for my stupidity, I'm searching various python tutorials, but can't figure out if it's some structure that can be easily loaded like numpy.load or json.loads, because when I try validating that structure as JSON, it says invalid json...
You have a list of three dictionaries (mappings of keys to values) and it works like this:
>>> dicts = [{'x': 230, 'y': 50}, {'x': 350, 'y': 50}, {'x': 410, 'y': 50}]
>>> dicts[0]
{'x': 230, 'y': 50}
>>> dicts[0]['x']
230
>>> dicts[2]['y']
50
Since all the dictionaries share the same keys ('x' and 'y') in your example you can interpret them as records.
A neat way to represent these records is with a pandas.DataFrame, which has a table-like printout.
>>> import pandas as pd
>>> pd.DataFrame(dicts)
x y
0 230 50
1 350 50
2 410 50
If you have a string
>>> s = "[{'x': 230, 'y': 50}, {'x': 350, 'y': 50}, {'x': 410, 'y': 50}]"
you can evaluate it safely with ast.literal_eval.
>>> from ast import literal_eval
>>> literal_eval(s)
[{'x': 230, 'y': 50}, {'x': 350, 'y': 50}, {'x': 410, 'y': 50}]
What you have there is a list of dictionaries.
# Three records, each a mapping with 'x' and 'y' entries.
dict1 = {'x': 230, 'y': 50}
dict2 = {'x': 350, 'y': 50}
dict3 = {'x': 410, 'y': 50}
# Gather them, in order, into one list of dictionaries.
myList = [dict1, dict2, dict3]
I have a list of dictionaries as follows in Python, and I have to group it by 'label' and get the highest value of 'confidence' for each 'label':
[{'label': 'id',
'confidence': 0.11110526,
'topleft': {'x': 0, 'y': 0},
'bottomright': {'x': 187, 'y': 57}},
{'label': 'id',
'confidence': 0.10690566,
'topleft': {'x': 265, 'y': 0},
'bottomright': {'x': 525, 'y': 54}},
{'label': 'name',
'confidence': 0.15541315,
'topleft': {'x': 9, 'y': 24},
'bottomright': {'x': 116, 'y': 58}},
{'label': 'group',
'confidence': 0.12578075,
'topleft': {'x': 53, 'y': 24},
'bottomright': {'x': 153, 'y': 61}},
{'label': 'name',
'confidence': 0.12709439,
'topleft': {'x': 0, 'y': 0},
'bottomright': {'x': 247, 'y': 84}},
{'label': 'group',
'confidence': 0.116156094,
'topleft': {'x': 96, 'y': 23},
'bottomright': {'x': 191, 'y': 61}}]
How do I achieve this efficiently
You can do this with groupby
from itertools import groupby

# groupby only merges consecutive items, so order the records by label first;
# otherwise a label that appears in several separate runs is reported once
# per run instead of once overall.
by_label = sorted(tst, key=lambda x: x['label'])
for n, g in groupby(by_label, key=lambda x: x['label']):
    # Highest confidence among the records sharing this label.
    best = max(item['confidence'] for item in g)
    # A single formatted string prints identically on Python 2 and Python 3.
    print("%s %s" % (n, best))
# For the sample data above this prints one line per label:
#   group 0.12578075
#   id 0.11110526
#   name 0.15541315
Result:
id 0.11110526
name 0.15541315
group 0.12578075
name 0.12709439
group 0.116156094
I have the following list
some_list = [{'key': 'YOUNG', 'x': 22, 'y': 0.9},
{'key': 'OLD', 'x': 45, 'y': 0.6},
{'key': 'OLD', 'x': 40, 'y': 0.3},
{'key': 'YOUNG', 'x': 25, 'y': 0.3}]
and I would like to change it to:
[{'key': 'YOUNG', 'values': [ {'x': 25, 'y': 0.3}, {'x': 22, 'y': 0.9} ]}
{'key': 'OLD', 'values': [ {'x': 40, 'y': 0.3}, {'x': 45, 'y': 0.6} ]}]
Added some of my attempts
arr = [{'key': 'YOUNG', 'x': 22, 'y': 0.9},
{'key': 'OLD', 'x': 45, 'y': 0.6},
{'key': 'OLD', 'x': 40, 'y': 0.3},
{'key': 'YOUNG', 'x': 25, 'y': 0.3}]
# Collect the 'key' of every record, then reduce to the distinct keys
# (going through set() makes their order arbitrary).
all_keys = []
for item in arr:
all_keys.append(item['key'])
all_keys = list(set(all_keys))
# The outer comprehension runs once per distinct key and the inner one builds
# a separate list of {'key', 'values'} dicts for it, so res is a list of lists.
res = [[{
'key': key,
'values': {'x': each['x'], 'y': each['y']}
} for each in arr if each['key'] == key]
for key in all_keys]
print res
But the result is not right, it constructs more lists:
[[{'values': {'y': 0.6, 'x': 45}, 'key': 'OLD'}, {'values': {'y': 0.3, 'x': 40}, 'key': 'OLD'}], [{'values': {'y': 0.9, 'x': 22}, 'key': 'YOUNG'}, {'values': {'y': 0.3, 'x': 25}, 'key': 'YOUNG'}]]
Thanks.
The loops should be like this:
# One output record per distinct key; its 'values' list gathers the x/y pair
# of every entry in arr that carries that key.
res = []
for key in all_keys:
    matching = [{'x': each['x'], 'y': each['y']} for each in arr if each['key'] == key]
    res.append({'key': key, 'values': matching})
Using an intermediate dictionary you can do:
>>> temp_data = {}
>>> for x in some_list:
... temp_data.setdefault(x['key'], []).append({k: x[k] for k in ['x', 'y']})
>>> [{'key': k, 'values': v} for k,v in temp_data.items()]
[{'key': 'OLD', 'values': [{'x': 45, 'y': 0.6}, {'x': 40, 'y': 0.3}]},
{'key': 'YOUNG', 'values': [{'x': 22, 'y': 0.9}, {'x': 25, 'y': 0.3}]}]
Though personally I would just leave it in dictionary form:
>>> temp_data
{'OLD': [{'x': 45, 'y': 0.6}, {'x': 40, 'y': 0.3}],
'YOUNG': [{'x': 22, 'y': 0.9}, {'x': 25, 'y': 0.3}]}
from itertools import groupby

data = [{'key': 'YOUNG', 'x': 22, 'y': 0.9},
        {'key': 'OLD', 'x': 45, 'y': 0.6},
        {'key': 'OLD', 'x': 40, 'y': 0.3},
        {'key': 'YOUNG', 'x': 25, 'y': 0.3}]
# groupby only merges adjacent items, so sort by the grouping key first.
data = sorted(data, key=lambda x: x['key'])
groups = []      # one list of {'x', 'y'} dicts per key, parallel to uniquekeys
uniquekeys = []  # the distinct keys, in sorted order
for k, v in groupby(data, lambda x: x['key']):
    # Keep only the coordinates; the key itself is stored in uniquekeys.
    val_list = [{'x': each_val['x'], 'y': each_val['y']} for each_val in v]
    groups.append(val_list)
    uniquekeys.append(k)
print(uniquekeys)
print(groups)
# zip() is lazy on Python 3, so materialise it to print the (key, values) pairs.
print(list(zip(uniquekeys, groups)))
You will get your output as a list of tuples where the first element is your key and the second one is the group/list of values,
[('OLD', [{'y': 0.6, 'x': 45}, {'y': 0.3, 'x': 40}]), ('YOUNG', [{'y': 0.9, 'x': 22}, {'y': 0.3, 'x': 25}])]
some_list = [{'key': 'YOUNG', 'x': 22, 'y': 0.9},
             {'key': 'OLD', 'x': 45, 'y': 0.6},
             {'key': 'OLD', 'x': 40, 'y': 0.3},
             {'key': 'YOUNG', 'x': 25, 'y': 0.3}]
# Maps each 'key' value to the list of records (minus their 'key' entry)
# that carried it, in input order.
outDict = {}
for dictionary in some_list:
    # Copy first so that removing 'key' leaves the dicts in some_list unaltered.
    copyDict = dictionary.copy()
    key = copyDict.pop('key')
    # setdefault creates the list the first time a key is seen.
    outDict.setdefault(key, []).append(copyDict)
print(outDict)
print(some_list)
Here you go-
some_list = [{'key': 'YOUNG', 'x': 22, 'y': 0.9},
             {'key': 'OLD', 'x': 45, 'y': 0.6},
             {'key': 'OLD', 'x': 40, 'y': 0.3},
             {'key': 'YOUNG', 'x': 25, 'y': 0.3}]


def _values_for(records, wanted_key):
    """Return the non-'key' fields of every record whose 'key' is *wanted_key*.

    Records that carry nothing besides 'key' contribute no entry.
    """
    collected = []
    for record in records:
        if record['key'] != wanted_key:
            continue
        fields = {name: value for name, value in record.items() if name != 'key'}
        if fields:
            collected.append(fields)
    return collected


# The same extraction serves both groups instead of two copy-pasted branches.
dict_young_vals = _values_for(some_list, 'YOUNG')
dict_old_vals = _values_for(some_list, 'OLD')
dict_young = {'key': 'YOUNG', 'values': dict_young_vals}
dict_old = {'key': 'OLD', 'values': dict_old_vals}
print(dict_young_vals)
# Final shape: one {'key', 'values'} record per group, YOUNG first.
result_dict = [dict_young, dict_old]
print(result_dict)
Another approach is to use defaultdict; it will run faster when the data is large.
from collections import defaultdict
# Maps each 'key' value to the list of records that carried it; missing keys
# start out as an empty list.
data = defaultdict(list)
some_list = [{'key': 'YOUNG', 'x': 22, 'y': 0.9},
             {'key': 'OLD', 'x': 45, 'y': 0.6},
             {'key': 'OLD', 'x': 40, 'y': 0.3},
             {'key': 'YOUNG', 'x': 25, 'y': 0.3}]
for item in some_list:
    # Work on a copy so the dicts in some_list keep their 'key' entry.
    vals = item.copy()
    del vals['key']
    data[item['key']].append(vals)
# Parenthesised print of a single expression behaves the same on Python 2 and 3.
print([{'key': k, 'values': v} for k, v in data.items()])
Output (dictionary does not care about ordering)-
[{'values': [{'y': 0.6, 'x': 45}, {'y': 0.3, 'x': 40}], 'key': 'OLD'}, {'values': [{'y': 0.9, 'x': 22}, {'y': 0.3, 'x': 25}], 'key': 'YOUNG'}]
some_list = [{'key': 'YOUNG', 'x': 22, 'y': 0.9},
             {'key': 'OLD', 'x': 45, 'y': 0.6},
             {'key': 'OLD', 'x': 40, 'y': 0.3},
             {'key': 'YOUNG', 'x': 25, 'y': 0.3}]
x = []       # result: one {'key', 'values'} record per key, in first-seen order
by_key = {}  # key -> its record in x, so lookups do not rescan x for every item
for i in some_list:
    values = {m: n for m, n in i.items() if m != "key"}
    entry = by_key.get(i["key"])
    if entry is None:
        # First time this key is seen: create its record and remember it.
        entry = {"key": i["key"], "values": []}
        by_key[i["key"]] = entry
        x.append(entry)
    entry["values"].append(values)
# Parenthesised print of a single expression behaves the same on Python 2 and 3.
print(x)
Output:[{'values': [{'y': 0.9, 'x': 22}, {'y': 0.3, 'x': 25}], 'key': 'YOUNG'}, {'values': [{'y': 0.6, 'x': 45}, {'y': 0.3, 'x': 40}], 'key': 'OLD'}]