I ran the code below on 522 gzip files of about 100 GB total; after decompression it is around 320 GB of data in protobuf format, and the output is written to GCS. I used n1-standard machines, the region for input and output is already taken care of, and the job cost me around $17. This is for half an hour of data, so I really badly need to do some cost optimization here.
I get the cost from the query below:
SELECT l.value AS JobID, ROUND(SUM(cost),3) AS JobCost
FROM `PROJECT.gcp_billing_data.gcp_billing_export_v1_{}` bill,
UNNEST(bill.labels) l
WHERE service.description = 'Cloud Dataflow' and l.key = 'goog-dataflow-job-id' and
extract(date from _PARTITIONTIME) > "2020-12-31"
GROUP BY 1
Complete code
import time
import sys
import argparse
import csv
import base64
import io
import json
import logging
from io import StringIO

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.io.gcp.bigquery_tools import RetryStrategy  # needed for insert_retry_strategy below
from google.protobuf import timestamp_pb2
from google.protobuf.json_format import MessageToDict
from google.protobuf.json_format import MessageToJson
from google.cloud import storage

### PROTOBUF CLASS
from otherfiles import processor_pb2
class ConvertToJson(beam.DoFn):
    def process(self, message, *args, **kwargs):
        # Imports repeated inside process() so they are available on Dataflow workers
        import base64
        import json
        from google.protobuf.json_format import MessageToDict
        from google.protobuf.json_format import MessageToJson
        from otherfiles import processor_pb2

        if len(message) >= 4:
            b64ProtoData = message[2]
            totalProcessorBids = int(message[3]) if message[3] else 0
            # Undo the custom character substitutions before base64-decoding
            b64ProtoData = b64ProtoData.replace('_', '/')
            b64ProtoData = b64ProtoData.replace('*', '=')
            b64ProtoData = b64ProtoData.replace('-', '+')
            finalbunary = base64.b64decode(b64ProtoData)
            log = processor_pb2.ProcessorLogProto()
            log.ParseFromString(finalbunary)
            # print(log)
            jsonObj = MessageToDict(log, preserving_proto_field_name=True)
            jsonObj["totalProcessorBids"] = totalProcessorBids
            # wjdata = json.dumps(jsonObj)
            print(jsonObj)
            return [jsonObj]
class ParseFile(beam.DoFn):
    def process(self, element, *args, **kwargs):
        import csv
        # Each element is one tab-separated line; csv.reader handles the quoting
        for line in csv.reader([element], quotechar='"', delimiter='\t',
                               quoting=csv.QUOTE_ALL, skipinitialspace=True):
            # print(line)
            return [line]
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", dest="input", required=False)
    parser.add_argument("--output", dest="output", required=False)
    parser.add_argument("--bucket", dest="bucket", required=True)
    parser.add_argument("--bfilename", dest="bfilename", required=True)
    app_args, pipeline_args = parser.parse_known_args()
    # pipeline_args.extend(['--runner=DirectRunner'])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    bucket_input = app_args.bucket
    bfilename = app_args.bfilename

    # Download the file that lists the GCS paths to process
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_input)
    blob = bucket.blob(bfilename)
    blob = blob.download_as_string()
    blob = blob.decode('utf-8')
    blob = StringIO(blob)

    pqueue = []
    names = csv.reader(blob)
    for i, filename in enumerate(names):
        if filename and filename[0]:
            pqueue.append(filename[0])

    with beam.Pipeline(options=pipeline_options) as p:
        if len(pqueue) > 0:
            input_list = app_args.input
            output_list = app_args.output
            events = (p
                      | "create PCol from list" >> beam.Create(pqueue)
                      | "read files" >> beam.io.textio.ReadAllFromText()
                      | "Transform" >> beam.ParDo(ParseFile())
                      | "Convert To JSON" >> beam.ParDo(ConvertToJson())
                      | "Write to BQ" >> beam.io.WriteToBigQuery(
                            table='TABLE',
                            dataset='DATASET',
                            project='PROJECT',
                            schema="dataevent:STRING",
                            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                            insert_retry_strategy=RetryStrategy.RETRY_ON_TRANSIENT_ERROR,
                            custom_gcs_temp_location='gs://BUCKET/gcs-temp-to-bq/',
                            method='FILE_LOADS'))

            ## bigquery failed rows NOT WORKING so commented
            # (events[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS]
            #  | "Bad lines" >> beam.io.textio.WriteToText("error_log.txt"))

            ## WRITING TO GCS
            # printFileConetent | "Write TExt" >> beam.io.WriteToText(output_list+"file_", file_name_suffix=".json", num_shards=1, append_trailing_newlines=True)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
The job took around 49 mins
Things I tried:
1) For Avro, I generated the JSON schema corresponding to the proto file and tried the code below to convert each dictionary to an Avro message, but it is slow because the dictionaries are large.
schema_separated is the Avro JSON schema, and it works fine.
from apache_beam.io.avroio import WriteToAvro
from fastavro.schema import parse_schema  # imports assumed by this snippet (not shown in the original)

with beam.Pipeline(options=pipeline_options) as p:
    if len(pqueue) > 0:
        input_list = app_args.input
        output_list = app_args.output
        p1 = p | "create PCol from list" >> beam.Create(pqueue)
        readListofFiles = p1 | "read files" >> beam.io.textio.ReadAllFromText()
        parsingProtoFile = readListofFiles | "Transform" >> beam.ParDo(ParseFile())
        printFileConetent = parsingProtoFile | "Convert To JSON" >> beam.ParDo(ConvertToJson())

        compressIdc = True
        use_fastavro = True
        printFileConetent | 'write_fastavro' >> WriteToAvro(
            output_list + "file_",
            # '/tmp/dataflow/{}/{}'.format('demo', 'output'),
            # parse_schema(json.loads(SCHEMA_STRING)),
            parse_schema(schema_separated),
            use_fastavro=use_fastavro,
            file_name_suffix='.avro',
            codec=('deflate' if compressIdc else 'null'),
        )
In the main code, I tried to insert the JSON record as a string into a BigQuery table, so that I could use JSON functions in BigQuery to extract the data, but that also didn't go well and I get the error below.
message: 'Error while reading data, error message: JSON table encountered too many errors,
giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details.'
reason: 'invalid'> [while running 'Write to
BQ/BigQueryBatchFileLoads/WaitForDestinationLoadJobs']
Tried to insert the above JSON dictionary into BigQuery, providing a JSON schema for the table, and that works fine as well.
Now the challenge is that the size roughly doubles after deserializing the proto into a JSON dict, and Dataflow cost is driven by how much data is processed.
I'm trying and reading a lot to make this work; if it works, I can then make it stable for production.
Sample JSON record.
{'timestamp': '1609286400', 'bidResponseId': '5febc300000115cd054b9fd6840a5af1', 'aggregatorId': '1', 'userId': '7567d74e-2e43-45f4-a42a-8224798bb0dd', 'uniqueResponseId': '', 'adserverId': '1002418', 'dataVersion': '1609285802', 'geoInfo': {'country': '101', 'region': '122', 'city': '11605', 'timezone': '420'}, 'clientInfo': {'os': '4', 'browser': '1', 'remoteIp': '36.70.64.0'}, 'adRequestInfo': {'requestingPage': 'com.opera.mini.native', 'siteId': '557243954', 'foldPosition': '2', 'adSlotId': '1', 'isTest': False, 'opType': 'TYPE_LEARNING', 'mediaType': 'BANNER'}, 'userSegments': [{'id': '2029660', 'weight': -1.0, 'recency': '1052208'}, {'id': '2034588', 'weight': -1.0, 'recency': '-18101'}, {'id': '2029658', 'weight': -1.0, 'recency': '744251'}, {'id': '2031067', 'weight': -1.0, 'recency': '1162398'}, {'id': '2029659', 'weight': -1.0, 'recency': '862833'}, {'id': '2033498', 'weight': -1.0, 'recency': '802749'}, {'id': '2016729', 'weight': -1.0, 'recency': '1620540'}, {'id': '2034584', 'weight': -1.0, 'recency': '111571'}, {'id': '2028182', 'weight': -1.0, 'recency': '744251'}, {'id': '2016726', 'weight': -1.0, 'recency': '1620540'}, {'id': '2028183', 'weight': -1.0, 'recency': '744251'}, {'id': '2028178', 'weight': -1.0, 'recency': '862833'}, {'id': '2016722', 'weight': -1.0, 'recency': '1675814'}, {'id': '2029587', 'weight': -1.0, 'recency': '38160'}, {'id': '2028177', 'weight': -1.0, 'recency': '862833'}, {'id': '2016719', 'weight': -1.0, 'recency': '1675814'}, {'id': '2027404', 'weight': -1.0, 'recency': '139031'}, {'id': '2028172', 'weight': -1.0, 'recency': '1052208'}, {'id': '2028173', 'weight': -1.0, 'recency': '1052208'}, {'id': '2034058', 'weight': -1.0, 'recency': '1191459'}, {'id': '2016712', 'weight': -1.0, 'recency': '1809526'}, {'id': '2030025', 'weight': -1.0, 'recency': '1162401'}, {'id': '2015235', 'weight': -1.0, 'recency': '139031'}, {'id': '2027712', 'weight': -1.0, 'recency': '139031'}, {'id': '2032447', 'weight': -1.0, 'recency': '7313670'}, {'id': '2034815', 'weight': -1.0, 'recency': '586825'}, {'id': '2034811', 'weight': -1.0, 'recency': '659366'}, {'id': '2030004', 'weight': -1.0, 'recency': '139031'}, {'id': '2027316', 'weight': -1.0, 'recency': '1620540'}, {'id': '2033141', 'weight': -1.0, 'recency': '7313670'}, {'id': '2034736', 'weight': -1.0, 'recency': '308252'}, {'id': '2029804', 'weight': -1.0, 'recency': '307938'}, {'id': '2030188', 'weight': -1.0, 'recency': '3591519'}, {'id': '2033449', 'weight': -1.0, 'recency': '1620540'}, {'id': '2029672', 'weight': -1.0, 'recency': '1441083'}, {'id': '2029664', 'weight': -1.0, 'recency': '636630'}], 'perfInfo': {'timeTotal': '2171', 'timeBidInitialize': '0', 'timeProcessDatastore': '0', 'timeGetCandidates': '0', 'timeAdFiltering': '0', 'timeEcpmComputation': '0', 'timeBidComputation': '0', 'timeAdSelection': '0', 'timeBidSubmit': '0', 'timeTFQuery': '0', 'timeVWQuery': '8'}, 'learningPercent': 0.10000000149011612, 'pageLanguageId': '0', 'sspUserId': 'CAESECHFlNeuUm16IYThguoQ8ck_1', 'minEcpm': 0.12999999523162842, 'adSpotId': '1', 'creativeSizes': [{'width': '7', 'height': '7'}], 'pageTypeId': '0', 'numSlots': '0', 'eligibleLIs': [{'type': 'TYPE_OPTIMIZED', 'liIds': [{'id': 44005, 'reason': '12', 'creative_id': 121574, 'bid_amount': 8.403361132251052e-08}, {'id': 46938, 'reason': '12', 'creative_id': 124916, 'bid_amount': 8.403361132251052e-06}, {'id': 54450, 'reason': '12', 'creative_id': 124916, 'bid_amount': 2.0117618771650174e-05}, {'id': 54450, 'reason': '12', 'creative_id': 135726, 'bid_amount': 
2.4237295484638312e-05}]}, {'type': 'TYPE_LEARNING'}], 'bidType': 4, 'isSecureRequest': True, 'sourceType': 3, 'deviceBrand': 82, 'deviceModel': 1, 'sellerNetworkId': 12814, 'interstitialRequest': False, 'nativeAdRequest': True, 'native': {'mainImg': [{'w': 0, 'h': 0, 'wmin': 1200, 'hmin': 627}, {'w': 0, 'h': 0, 'wmin': 1200, 'hmin': 627}, {'w': 0, 'h': 0, 'wmin': 1200, 'hmin': 627}, {'w': 0, 'h': 0, 'wmin': 1200, 'hmin': 627}], 'iconImg': [{'w': 0, 'h': 0, 'wmin': 0, 'hmin': 0}, {'w': 0, 'h': 0, 'wmin': 100, 'hmin': 100}, {'w': 0, 'h': 0, 'wmin': 0, 'hmin': 0}, {'w': 0, 'h': 0, 'wmin': 100, 'hmin': 100}], 'logoImg': [{'w': 0, 'h': 0, 'wmin': 100, 'hmin': 100}, {'w': 0, 'h': 0, 'wmin': 0, 'hmin': 0}, {'w': 0, 'h': 0, 'wmin': 100, 'hmin': 100}, {'w': 0, 'h': 0, 'wmin': 0, 'hmin': 0}]}, 'throttleWeight': 1, 'isSegmentReceived': False, 'viewability': 46, 'bannerAdRequest': False, 'videoAdRequest': False, 'mraidAdRequest': True, 'jsonModelCallCount': 0, 'totalProcessorBids': 1}
Can someone help me here?
PFA screenshots for reference as well
My advice here would be to use Java to perform your transformations.
In Java, you can convert the Protobuf into Avro like this: Writing protobuf object in parquet using apache beam
And once you've done that, you can use AvroIO to write the data to files.
Java is much more performant than Python, and will save you computing resources. Since this job does something very simple, and does not require any special Python libraries, I encourage you strongly to try and go with Java.
Just wanted to bring your attention to "FlexRS" if you haven't checked it yet. It uses preemptible virtual machine (VM) instances, which can reduce your cost.
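A minimal sketch of how FlexRS could be requested from the option parsing already in the pipeline above; --flexrs_goal is the standard Dataflow pipeline option for this, and COST_OPTIMIZED is its cost-saving setting (note that FlexRS batch jobs use flexible, delayed scheduling):
from apache_beam.options.pipeline_options import PipelineOptions

# Append the FlexRS goal to the pipeline arguments already parsed in run().
# COST_OPTIMIZED lets Dataflow run the batch job on a mix of preemptible
# and regular workers, which lowers the worker cost.
pipeline_options = PipelineOptions(
    pipeline_args + ["--flexrs_goal=COST_OPTIMIZED"]
)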
Related
I am using the Python sqs_extended_client to process data, with an S3 bucket for large payload processing, but despite using it I am getting the following error:
ERROR - An error occurred (InvalidParameterValue) when calling the SendMessage operation: One or more parameters are invalid. Reason: Message must be shorter than 1024 bytes.
Following is my code
import json

import boto3

# AWS_SQS_QUEUE_URL and SQS_BUCKET are defined elsewhere in my configuration

# boto resource
resource = boto3.resource("sqs")
queue = resource.Queue(AWS_SQS_QUEUE_URL)
queue.large_payload_support = SQS_BUCKET
queue.message_size_threshold = 65536

# Send message code
def send_message(message_body):
    response = queue.send_message(
        QueueUrl=AWS_SQS_QUEUE_URL, MessageBody=json.dumps(message_body)
    )
    return response
message = {'topic_name': 'report', 'report': {'header': {'stamp': {'sec': 166515321, 'nanosec': 313451487}, 'x_id': ''}, 'b_id': 'a1', 'c_id': 1, 'd_id': 2, 'e_id': 'STOPPED', 'f_id': '', 'd_id': False, 'e_id': False, 'g_id': 1, 'id': 0, 'h_id': 0, 'i_id': 1.0, 'j_id': {'pose': {'k_id': 12040, 'm_id': 180}, 'lyf_id': False}, 'pose': {'x': 27, 'y': -12, 'thaw': 3}, 'next_pose': {'x': 27, 'y': -12, 'yaw': 3}, 'current_goal': {'pose': {'node_id': 0, 'heading': 0}, 'lyf_id': False}, 'goals': '', 'goal_id': '', 'next_goal': {'pose': {'node_id': 0, 'heading': 0}, 'lift': False}, 'next_goal_type': '', 'next_goal_id': ''}, 'filename': 'filename.ext'}
status = send_message(json.dumps(message))
I use the latest version of Spyder to code. I made a simple graph with this code:
import pandas as pd
import sys
import os
import plotly.express as px
filepath = input('Enter filepath: ')
assert os.path.exists(filepath), "I did not find the file at, " + str(filepath)
f = open(filepath, 'r+')
print("Hooray we found your file!")
f.close()
file = pd.read_csv(filepath, encoding='latin1', delimiter=',')
fig = px.histogram(file, x='Idade', color='Categoria')
print(fig)
#Idade means age and Categoria will show who has canceled services or not ('Cliente' and 'Cancelado', I'm sure you know what each means). The idea is that the graph has to show the ratio of cancellation of services between different ages. Ex: in a group of people of 20 years of age, 50 cancelled but 120 still remain clients.
But when I try to run it, Spyder shows me this weird output... I don't even know what to call it:
Figure({
'data': [{'alignmentgroup': 'True',
'hovertemplate': 'Categoria=Cliente<br>Idade=%{x}<br>index=%{y}<extra></extra>',
'legendgroup': 'Cliente',
'marker': {'color': '#636efa'},
'name': 'Cliente',
'offsetgroup': 'Cliente',
'orientation': 'h',
'showlegend': True,
'textposition': 'auto',
'type': 'bar',
'x': array([45, 49, 51, ..., 54, 56, 50], dtype=int64),
'xaxis': 'x',
'y': array([ 0, 1, 2, ..., 10120, 10121, 10122], dtype=int64),
'yaxis': 'y'},
{'alignmentgroup': 'True',
'hovertemplate': 'Categoria=Cancelado<br>Idade=%{x}<br>index=%{y}<extra></extra>',
'legendgroup': 'Cancelado',
'marker': {'color': '#EF553B'},
'name': 'Cancelado',
'offsetgroup': 'Cancelado',
'orientation': 'h',
'showlegend': True,
'textposition': 'auto',
'type': 'bar',
'x': array([62, 66, 54, ..., 44, 30, 43], dtype=int64),
'xaxis': 'x',
'y': array([ 21, 39, 51, ..., 10124, 10125, 10126], dtype=int64),
'yaxis': 'y'}],
'layout': {'barmode': 'relative',
'legend': {'title': {'text': 'Categoria'}, 'tracegroupgap': 0},
'margin': {'t': 60},
'template': '...',
'xaxis': {'anchor': 'y', 'domain': [0.0, 1.0], 'title': {'text': 'Idade'}},
'yaxis': {'anchor': 'x', 'domain': [0.0, 1.0], 'title': {'text': 'index'}}}
})
How do I get an actual image instead of this? By the way, I can't use Jupyter or Google Colab, since I have to make an executable program that generates said images as JPEG or whatever.
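A minimal sketch of two ways to turn that Figure into something viewable outside a notebook; write_image assumes the kaleido package is installed, and "histogram.jpeg" is just a placeholder file name:
# Open the interactive figure in the default browser instead of printing its repr:
fig.show()

# Or render a static image file (requires the "kaleido" package;
# "histogram.jpeg" is a placeholder name):
fig.write_image("histogram.jpeg")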
I have the following huge output from this code: urllib.request.urlopen("https://api...").read()
This looks like a JSON object, but it is a bytes object. I am not sure how to dig into and parse all these nested dictionaries. Any help would be appreciated. I want to extract the value 112242287903649 located near the end.
b'{"address":"0x4264422fa4c1e60c2ee10d19549c0775fe544d7c","ETH":{"balance":39234.92760140797,"price":{"rate":406.0918669863694,"diff":3.33,"diff7d":7.19,"ts":1603860182,"marketCapUsd":45964513524.05101,"availableSupply":113187476.1865,"volume24h":14765115042.093159,"diff30d":14.028844201369225}},"countTxs":7,"tokens":[{"tokenInfo":{"address":"0x0d4b4da5fb1a7d55e85f8e22f728701ceb6e44c9","name":"DigiMax","decimals":"18","symbol":"DGMT","totalSupply":"1000000000000000000000000000","owner":"0x","lastUpdated":1603831313,"issuancesCount":0,"holdersCount":1042,"description":"DigiMax (DGMT) is a de-centralized Currency on ETHEREUM NETWORK. It is trustless, non-custodial, Layer-2 scaling solution for transferring value on Ethereum. It is Open Source. Community oriented and powered to maximize the power of the blockchain technology","website":"https://digimaxtoken.io/","twitter":"DigiMax_DGMT","image":"/images/DGMT0d4b4da5.png","telegram":"https://t.me/DigiMaxToken","reddit":"DigiMax_DGMT","coingecko":"digimax","price":{"rate":1.218303675e-5,"diff":3.55,"diff7d":-87.33,"ts":1603860187,"marketCapUsd":0,"availableSupply":0,"volume24h":0.36549128,"diff30d":-99.95948266499424,"currency":"USD"}},"balance":3.9e+19,"totalIn":0,"totalOut":0},{"tokenInfo":{"address":"0x28cb7e841ee97947a86b06fa4090c8451f64c0be","name":"YF Link","decimals":"18","symbol":"YFL","totalSupply":"52000000000000000000000","owner":"0x","lastUpdated":1603851830,"issuancesCount":0,"holdersCount":5164,"image":"/images/YFL28cb7e84.png","website":"https://yflink.io/","telegram":"https://t.me/YFLinkGroup","twitter":"YFLinkio","coingecko":"yflink","price":{"rate":411.62315709142763,"diff":2.44,"diff7d":22.67,"ts":1603860243,"marketCapUsd":20628385.985420085,"availableSupply":50114.73633112,"volume24h":673808.77973096,"diff30d":-9.745291974110742,"currency":"USD"},"publicTags":["Yield Farming","Yearn","Governance"]},"balance":69000000000000,"totalIn":0,"totalOut":0},{"tokenInfo":{"address":"0x618e75ac90b12c6049ba3b27f5d5f8651b0037f6","name":"QASH","decimals":"6","symbol":"QASH","totalSupply":"1000000000000000","owner":"0x9fa8a9cd0bd7cbfc503513bc94cd3b3a9ca90e35","lastUpdated":1603818056,"issuancesCount":0,"holdersCount":13087,"website":"https://liquid.plus/","facebook":"LiquidGlobal","telegram":"https://t.me/QUOINENews","twitter":"Liquid_Global","image":"/images/QASH618e75ac.jpeg","reddit":"liquid","coingecko":"qash","ethTransfersCount":2,"price":{"rate":0.03783789848158,"diff":2.83,"diff7d":0.05,"ts":1603860243,"marketCapUsd":13243264.468553,"availableSupply":350000000,"volume24h":170565.95092274,"diff30d":-5.421371004476654,"currency":"USD"},"publicTags":["Exchange"]},"balance":112242287903649,"totalIn":0,"totalOut":0},{"tokenInfo":{"address":"0x9f7229af0c4b9740e207ea283b9094983f78ba04","decimals":"18","name":"Tadpole","owner":"0x","symbol":"TAD","totalSupply":"1000000000000000000000000","lastUpdated":1603859098,"issuancesCount":0,"holdersCount":597,"price":false},"balance":100000000000000,"totalIn":0,"totalOut":0}]}'
The built-in json module is perfectly capable of parsing byte strings:
import json
response = urllib.request.urlopen("https://api/endpoint").read()
jsondat = json.loads(response)
Now you can use jsondat however you'd like and extract whichever nested property you desire.
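For instance, the value the question mentions (112242287903649) is the balance of the QASH entry in the tokens list; a small sketch of pulling it out either by position or by symbol:
# By position: QASH is the third entry of the "tokens" list in this response.
qash_balance = jsondat["tokens"][2]["balance"]

# Safer: look it up by symbol so the code does not depend on list order.
qash_balance = next(
    t["balance"]
    for t in jsondat["tokens"]
    if t["tokenInfo"]["symbol"] == "QASH"
)
print(qash_balance)  # 112242287903649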
Note that you can also use the requests module to achieve this a bit more simply, though you absolutely don't have to in this case:
import requests
jsondat = requests.get("https://api/endpoint").json()
Doing json.loads on your given byte string yields-
{'address': '0x4264422fa4c1e60c2ee10d19549c0775fe544d7c',
'ETH': {'balance': 39234.92760140797,
'price': {'rate': 406.0918669863694,
'diff': 3.33,
'diff7d': 7.19,
'ts': 1603860182,
'marketCapUsd': 45964513524.05101,
'availableSupply': 113187476.1865,
'volume24h': 14765115042.093159,
'diff30d': 14.028844201369225}},
'countTxs': 7,
'tokens': [{'tokenInfo': {'address': '0x0d4b4da5fb1a7d55e85f8e22f728701ceb6e44c9',
'name': 'DigiMax',
'decimals': '18',
'symbol': 'DGMT',
'totalSupply': '1000000000000000000000000000',
'owner': '0x',
'lastUpdated': 1603831313,
'issuancesCount': 0,
'holdersCount': 1042,
'description': 'DigiMax (DGMT) is a de-centralized Currency on ETHEREUM NETWORK. It is trustless, non-custodial, Layer-2 scaling solution for transferring value on Ethereum. It is Open Source. Community oriented and powered to maximize the power of the blockchain technology',
'website': 'https://digimaxtoken.io/',
'twitter': 'DigiMax_DGMT',
'image': '/images/DGMT0d4b4da5.png',
'telegram': 'https://t.me/DigiMaxToken',
'reddit': 'DigiMax_DGMT',
'coingecko': 'digimax',
'price': {'rate': 1.218303675e-05,
'diff': 3.55,
'diff7d': -87.33,
'ts': 1603860187,
'marketCapUsd': 0,
'availableSupply': 0,
'volume24h': 0.36549128,
'diff30d': -99.95948266499424,
'currency': 'USD'}},
'balance': 3.9e+19,
'totalIn': 0,
'totalOut': 0},
{'tokenInfo': {'address': '0x28cb7e841ee97947a86b06fa4090c8451f64c0be',
'name': 'YF Link',
'decimals': '18',
'symbol': 'YFL',
'totalSupply': '52000000000000000000000',
'owner': '0x',
'lastUpdated': 1603851830,
'issuancesCount': 0,
'holdersCount': 5164,
'image': '/images/YFL28cb7e84.png',
'website': 'https://yflink.io/',
'telegram': 'https://t.me/YFLinkGroup',
'twitter': 'YFLinkio',
'coingecko': 'yflink',
'price': {'rate': 411.62315709142763,
'diff': 2.44,
'diff7d': 22.67,
'ts': 1603860243,
'marketCapUsd': 20628385.985420085,
'availableSupply': 50114.73633112,
'volume24h': 673808.77973096,
'diff30d': -9.745291974110742,
'currency': 'USD'},
'publicTags': ['Yield Farming', 'Yearn', 'Governance']},
'balance': 69000000000000,
'totalIn': 0,
'totalOut': 0},
{'tokenInfo': {'address': '0x618e75ac90b12c6049ba3b27f5d5f8651b0037f6',
'name': 'QASH',
'decimals': '6',
'symbol': 'QASH',
'totalSupply': '1000000000000000',
'owner': '0x9fa8a9cd0bd7cbfc503513bc94cd3b3a9ca90e35',
'lastUpdated': 1603818056,
'issuancesCount': 0,
'holdersCount': 13087,
'website': 'https://liquid.plus/',
'facebook': 'LiquidGlobal',
'telegram': 'https://t.me/QUOINENews',
'twitter': 'Liquid_Global',
'image': '/images/QASH618e75ac.jpeg',
'reddit': 'liquid',
'coingecko': 'qash',
'ethTransfersCount': 2,
'price': {'rate': 0.03783789848158,
'diff': 2.83,
'diff7d': 0.05,
'ts': 1603860243,
'marketCapUsd': 13243264.468553,
'availableSupply': 350000000,
'volume24h': 170565.95092274,
'diff30d': -5.421371004476654,
'currency': 'USD'},
'publicTags': ['Exchange']},
'balance': 112242287903649,
'totalIn': 0,
'totalOut': 0},
{'tokenInfo': {'address': '0x9f7229af0c4b9740e207ea283b9094983f78ba04',
'decimals': '18',
'name': 'Tadpole',
'owner': '0x',
'symbol': 'TAD',
'totalSupply': '1000000000000000000000000',
'lastUpdated': 1603859098,
'issuancesCount': 0,
'holdersCount': 597,
'price': False},
'balance': 100000000000000,
'totalIn': 0,
'totalOut': 0}]}
I am trying to pull the table data from this website - 'https://understat.com/league/EPL'
When I viewed the source code, the table is saved in a <script> tag. I want to know how to extract the data from the script in a usable format.
I tried using the solution from a similar question (How to Get Script Tag Variables From a Website using Python):
import requests
import bs4
import json
url = 'https://understat.com/league/EPL'
r = requests.get(url)
bs = bs4.BeautifulSoup(r.text, "html.parser")
scripts = bs.find_all('script')
for s in scripts:
    if 'var datesData' in s.text:
        script = s.text
        print(script)
However, nothing is getting printed, that is, it can't find 'var datesData' in the script, but when I just print(scripts), I get:
[<script>
var THEME = localStorage.getItem("theme") || 'DARK';
document.body.className = "theme-" + THEME.toLowerCase();
</script>,
<script>
var datesData = JSON.parse('\x5B\x7B\x22id\x22\x3A\x2211643\x22,\x22isResult\x22\x3Atrue,\x22h\x22\x3A\x7B\x22id\x22\x3A\x2287\x22,\x22title\x22\x3A\x22Liverpool\x22,\x22short_title\x22\x3A\x22LIV\x22\x7D,\x22a\x22\x3A\x7B\x22id\x22\x3A\x2279\x22,\x22title\x22\x3A\x22Norwich\x22,\x22short_title\x22\x3A\x22NOR...
and so on
]
As you can see, the second script element contains 'var datesData', but my code won't print it.
What I want is to get that second script from the list and pull out the data inside JSON.parse() so I can eventually create a dataframe. One option is to copy that entire line from the URL's source code and pass it to json.loads(), like:
js = json.loads('\x5B\x7B\x22id\x22\x3A\x2211643\x22,\x22isResult\x22\x3Atrue,\x22h\x22\...')
which gives me an output of:
[{'id': '11643',
'isResult': True,
'h': {'id': '87', 'title': 'Liverpool', 'short_title': 'LIV'},
'a': {'id': '79', 'title': 'Norwich', 'short_title': 'NOR'},
'goals': {'h': '4', 'a': '1'},
'xG': {'h': '2.23456', 'a': '0.842407'},
'datetime': '2019-08-09 20:00:00',
'forecast': {'w': '0.7377', 'd': '0.1732', 'l': '0.0891'}},
{'id': '11644',
'isResult': True,
'h': {'id': '81', 'title': 'West Ham', 'short_title': 'WHU'},
'a': {'id': '88', 'title': 'Manchester City', 'short_title': 'MCI'},
'goals': {'h': '0', 'a': '5'},
'xG': {'h': '1.2003', 'a': '3.18377'},
'datetime': '2019-08-10 12:30:00',
'forecast': {'w': '0.0452', 'd': '0.1166', 'l': '0.8382'}},
{'id': '11645',
'isResult': True,
...
However, the better way is to pull the data from the website itself, so I can account for changes that WILL happen to the data later.
TLDR: I want to read the data stored in a script tag in a readable format using Python
Perhaps something like the following, which extracts the argument of JSON.parse(...) with a regex, decodes the quoted JavaScript string literal (with its \x escapes) into a plain Python string via ast.literal_eval, and then parses that string with json.loads:
import ast
import json
import re
from pprint import pprint
import requests
pattern = re.compile(r'\bvar\s+datesData\s*=\s*JSON\.parse\((.+?)\)')
url = 'https://understat.com/league/EPL'
r = requests.get(url)
s = r.text
m = pattern.search(s)
data = m.group(1)
o = json.loads(ast.literal_eval(data))
pprint(o[:3])
which gives me
[{'a': {'id': '79', 'short_title': 'NOR', 'title': 'Norwich'},
'datetime': '2019-08-09 20:00:00',
'forecast': {'d': '0.1732', 'l': '0.0891', 'w': '0.7377'},
'goals': {'a': '1', 'h': '4'},
'h': {'id': '87', 'short_title': 'LIV', 'title': 'Liverpool'},
'id': '11643',
'isResult': True,
'xG': {'a': '0.842407', 'h': '2.23456'}},
{'a': {'id': '88', 'short_title': 'MCI', 'title': 'Manchester City'},
'datetime': '2019-08-10 12:30:00',
'forecast': {'d': '0.1166', 'l': '0.8382', 'w': '0.0452'},
'goals': {'a': '5', 'h': '0'},
'h': {'id': '81', 'short_title': 'WHU', 'title': 'West Ham'},
'id': '11644',
'isResult': True,
'xG': {'a': '3.18377', 'h': '1.2003'}},
{'a': {'id': '238', 'short_title': 'SHE', 'title': 'Sheffield United'},
'datetime': '2019-08-10 15:00:00',
'forecast': {'d': '0.3923', 'l': '0.3994', 'w': '0.2083'},
'goals': {'a': '1', 'h': '1'},
'h': {'id': '73', 'short_title': 'BOU', 'title': 'Bournemouth'},
'id': '11645',
'isResult': True,
'xG': {'a': '1.59864', 'h': '1.34099'}}]
My Python script connects to an API and gets some JSON.
I've been trying out prettyprint, parse, loads, dumps but I haven't figured them out yet...
Right now, when I do print(request.json()) I get this:
{'info': {'status': 'OK', 'time': {'seconds': 0.050006151199341, 'human': '50 milliseconds'}},
'datalist': {'total': 1, 'count': 1, 'offset': 0, 'limit': 3, 'next': 1, 'hidden': 0, 'loaded': True, 'list': [
{'id': 27862209, 'name': 'Fate/Grand Order', 'package': 'com.xiaomeng.fategrandorder',
'uname': 'komoe-game-fate-go', 'size': 49527668,
'icon': 'http://pool.img.xxxxx.com/msi8/9b58a48638b480c17135a10810374bd6_icon.png',
'graphic': 'http://pool.img.xxxxx.com/msi8/3a240b50ac37a9824b9ac99f1daab8c8_fgraphic_705x345.jpg',
'added': '2017-05-20 10:54:53', 'modified': '2017-05-20 10:54:53', 'updated': '2018-02-12 12:35:51',
'uptype': 'regular', 'store': {'id': 750918, 'name': 'msi8',
'avatar': 'http://pool.img.xxxxx.com/msi8/c61a8cfe9f68bfcfb71ef59b46a8ae5d_ravatar.png',
'appearance': {'theme': 'grey',
'description': '❤️ Welcome To Msi8 Store & My Store Will Mostly Be Specialized in Games With OBB File Extension. I Hope You Find What You Are Looking For Here ❤️'},
'stats': {'apps': 20776, 'subscribers': 96868, 'downloads': 25958359}},
'file': {'vername': '1.14.5', 'vercode': 52, 'md5sum': 'xxxxx', 'filesize': 49527668,
'path': 'http://pool.apk.xxxxx.com/msi8/com-xiaomeng-fategrandorder-52-27862209-32a264b031d6933514970c43dea4191f.apk',
'path_alt': 'http://pool.apk.xxxxx.com/msi8/alt/Y29tLXhpYW9tZW5nLWZhdGVncmFuZG9yZGVyLTUyLTI3ODYyMjA5LTMyYTI2NGIwMzFkNjkzMzUxNDk3MGM0M2RlYTQxOTFm.apk',
'malware': {'rank': 'UNKNOWN'}},
'stats': {'downloads': 432, 'pdownloads': 452, 'rating': {'avg': 0, 'total': 0},
'prating': {'avg': 0, 'total': 0}}, 'has_versions': False, 'obb': None,
'xxxxx': {'advertising': False, 'billing': False}}]}}
But I want it to look like this:
>>> import json
>>> a={"some":"json", "a":{"b":[1,2,3,4]}}
>>> print(json.dumps(a, indent=4, sort_keys=True))
{
"a": {
"b": [
1,
2,
3,
4
]
},
"some": "json"
}
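Applying that same call to the API response gives the formatting you want (a sketch; request is assumed to be the Response object returned by your API call):
import json

# Pretty-print the parsed JSON with 4-space indentation and sorted keys
print(json.dumps(request.json(), indent=4, sort_keys=True))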