Python script to download a YouTube video

Given a YouTube video URL, I first download the video page and extract the JavaScript object between
<script>var ytplayer = ytplayer ..... </script>
I got:
{
    "args": {
        "is_listed": "1",
        "account_playback_token": "QUFFLUhqbWdXR1NfQjRiRmNzWVhRVTM0ajlNcnM1alVUd3xBQ3Jtc0tsVi01WFp5VmV2MTU3RnpkYUVkRzVqR1ZTNUI4T2JaQzk1ckxPejdVNkYzUk5zOTdjZnNmb1BYZHNLQ05nblZZbFk2ZWJXNHRPNVFoNVVNc2RjTE1YekdKSGY4dlVhSnlCU1ctNFZJdXBKbWhIRG1TZw==",
        "ptk": "RajshriEntertainment",
        "focEnabled": "1",
        "tag_for_child_directed": false,
        "adaptive_fmts": ......,
        "probe_url": .....,
        "rmktEnabled": "1",
        "allow_ratings": "1",
        "dbp": "ChoKFk5RNTV5UGs5bDZmSk5wSjQ4a3RiSHcQARABGAI",
        "cc3_module": "1",
        "no_get_video_log": "1",
        "fmt_list": ......,
        "title": ..........,
        "invideo": true,
        "sffb": true,
        "iurlmq_webp": ,
        "cosver": "10_8_4",
        "url_encoded_fmt_stream_map": .................,
        "max_dynamic_allocation_ad_tag_length": "2040",
        "innertube_api_key": "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8",
        "timestamp": "1446586407",
        "cc_asr": "1",
        "apiary_host_firstparty": "",
        "adsense_video_doc_id": "yt_Vd4iNPuRlx4",
        "innertube_context_client_version": "1.20151102",
        "mpu": true,
        "tmi": "1",
        "ldpj": "-19",
        "fade_out_duration_milliseconds": "1000",
        .........
    }
}
I found that the keys adaptive_fmts and url_encoded_fmt_stream_map contain multiple URLs in percent-encoded form.
I took one URL from url_encoded_fmt_stream_map; it looks like this:
https://r1---sn-o3o-qxal.googlevideo.com/videoplayback?
ratebypass=yes&
signature=982E413BBE08CA5801420F9696E0F2ED691B99FA.D666D39D1A0AF066F76F12632A10D3B8076076CE&
lmt=1443906393476832&
expire=1446604919&
fexp=9406983%2C9408710%2C9414764%2C9416126%2C9417707%2C9421410%2C9422596%2C9423663&
itag=22&
dur=128.801&
source=youtube&
upn=pk2CEhVBeFM&
sver=3&
key=yt6&
id=o-AK-OlE5NUsbkp51EZY2yKuz5vsSGofgUvrvTtOrhC72e&
sparams=dur%2Cid%2Cinitcwndbps%2Cip%2Cipbits%2Citag%2Clmt%2Cmime%2Cmm%2Cmn%2Cms%2Cmv%2Cpl%2Cratebypass%2Crequiressl%2Csource%2Cupn%2Cexpire&
mime=video%2Fmp4&
ipbits=0&
pl=21&
ip=x.y.z.a&
initcwndbps=5405000&
requiressl=yes&
mn=sn-o3o-qxal&
mm=31&
ms=au&
mv=m&
mt=1446583222&
itag=22&
type=video/mp4
But when I paste this URL into a browser, nothing happens; it doesn't work.
Please help me.
Also, what is the difference between the URLs contained in adaptive_fmts and url_encoded_fmt_stream_map?

In Python 2.7, this works:
import urlparse, urllib2

vid = "vzS1Vkpsi5k"
save_title = "YouTube SpaceX - Booster Number 4 - Thaicom 8 06-06-2016"
url_init = "https://www.youtube.com/get_video_info?video_id=" + vid

resp = urllib2.urlopen(url_init, timeout=10)
data = resp.read()
info = urlparse.parse_qs(data)

title = info['title']
print "length: ", info['length_seconds'][0] + " seconds"

stream_map = info['adaptive_fmts'][0]
vid_info = stream_map.split(",")
mp4_filename = save_title + ".mp4"

for video in vid_info:
    item = urlparse.parse_qs(video)
    #print 'quality: ', item['quality'][0]
    #print 'type: ', item['type'][0]
    url_download = item['url'][0]
    resp = urllib2.urlopen(url_download)
    print resp.headers
    length = int(resp.headers['Content-Length'])
    my_file = open(mp4_filename, "wb")  # binary mode so the video bytes aren't mangled
    done, i = 0, 0
    buff = resp.read(1024)
    while buff:
        my_file.write(buff)
        done += 1024
        buff = resp.read(1024)
        if not i % 1000:
            percent = done * 100.0 / length
            print str(percent) + "%"
        i += 1
    break
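For reference, here is a rough Python 3 translation of the same approach. This is a sketch only: get_video_info and adaptive_fmts are internal YouTube details and may no longer be served, so it mirrors the logic above rather than guaranteeing a working download.

import urllib.parse
import urllib.request

vid = "vzS1Vkpsi5k"
url_init = "https://www.youtube.com/get_video_info?video_id=" + vid

with urllib.request.urlopen(url_init, timeout=10) as resp:
    info = urllib.parse.parse_qs(resp.read().decode("utf-8"))

# each entry of adaptive_fmts is itself a percent-encoded query string
stream_map = info["adaptive_fmts"][0]
first_stream = urllib.parse.parse_qs(stream_map.split(",")[0])
url_download = first_stream["url"][0]  # parse_qs already percent-decodes the value

with urllib.request.urlopen(url_download) as stream, open("video.mp4", "wb") as out:
    while True:
        chunk = stream.read(64 * 1024)
        if not chunk:
            break
        out.write(chunk)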


Scraping a website with page limitation

I am trying to scrape the https://www.olx.com.eg/en/properties/ listings. The site shows 200,000+ ads and I'd like to scrape all 200,000 listings, but the pagination doesn't go above 49 pages. I have figured out the API endpoint the data comes from.
API endpoint:
'https://search.olx.com.eg/_msearch?filter_path=took%2C*.took%2C*.suggest.*.options.text%2C*.suggest.*.options._source.*%2C*.hits.total.*%2C*.hits.hits._source.*%2C*.hits.hits.highlight.*%2C*.error%2C*.aggregations.*.buckets.key%2C*.aggregations.*.buckets.doc_count%2C*.aggregations.*.buckets.complex_value.hits.hits._source%2C*.aggregations.*.filtered_agg.facet.buckets.key%2C*.aggregations.*.filtered_agg.facet.buckets.doc_count%2C*.aggregations.*.filtered_agg.facet.buckets.complex_value.hits.hits._source'
POST data:
data = '{"index":"olx-eg-production-ads-ar"}\n{"from":0,"size":0,"track_total_hits":false,"query":{"bool":{"must":[{"term":{"category.slug":"properties"}}]}},"aggs":{"category.lvl1.externalID":{"global":{},"aggs":{"filtered_agg":{"filter":{"bool":{"must":[{"term":{"category.lvl0.externalID":"138"}}]}},"aggs":{"facet":{"terms":{"field":"category.lvl1.externalID","size":20}}}}}},"location.lvl1":{"global":{},"aggs":{"filtered_agg":{"filter":{"bool":{"must":[{"term":{"category.slug":"properties"}},{"term":{"location.lvl0.externalID":"0-1"}}]}},"aggs":{"facet":{"terms":{"field":"location.lvl1.externalID","size":20},"aggs":{"complex_value":{"top_hits":{"size":1,"_source":{"include":["location.lvl1"]}}}}}}}}},"product":{"global":{},"aggs":{"filtered_agg":{"filter":{"bool":{"must":[{"term":{"category.slug":"properties"}},{"term":{"product":"featured"}},{"term":{"location.externalID":"0-1"}}]}},"aggs":{"facet":{"terms":{"field":"product","size":20},"aggs":{"complex_value":{"top_hits":{"size":1,"_source":{"include":["product"]}}}}}}}}},"totalProductCount":{"global":{},"aggs":{"filtered_agg":{"filter":{"bool":{"must":[{"term":{"category.slug":"properties"}},{"term":{"product":"featured"}}]}},"aggs":{"facet":{"terms":{"field":"product","size":20},"aggs":{"complex_value":{"top_hits":{"size":1,"_source":{"include":["totalProductCount"]}}}}}}}}}}}\n{"index":"olx-eg-production-ads-ar"}\n{"from":0,"size":45,"track_total_hits":200000,"query":{"function_score":{"random_score":{"seed":97},"query":{"bool":{"must":[{"term":{"category.slug":"properties"}},{"term":{"product":"featured"}}]}}}},"sort":["_score"]}\n{"index":"olx-eg-production-ads-ar"}\n{"from":10045,"size":45,"track_total_hits":200000,"query":{"bool":{"must":[{"term":{"category.slug":"properties"}}]}},"sort":[{"timestamp":{"order":"desc"}},{"id":{"order":"desc"}}]}\n'
The problem is that even this Elasticsearch endpoint has a limit of 10,000 listings; when I try to increase the from value in the POST data it throws:
{"message":"[query_phase_execution_exception] Result window is too large, from + size must be less than or equal to: [10000] but was [10045]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting.","error":{"status":500}
I'd like to get all 200,000 listings. Is there any workaround?
Here is my code:
import scrapy
from scrapy.crawler import CrawlerProcess
import requests


class OlxScraper(scrapy.Spider):
    name = "olx-scraper"
    custom_settings = {
        "FEED_FORMAT": "csv",
        "FEED_URI": "olx_eg_property_listing.csv",
        "LOG_FILE": "olx_eg.log",
    }
    listing_endpoint = "https://search.olx.com.eg/_msearch?filter_path=took%2C*.took%2C*.suggest.*.options.text%2C*.suggest.*.options._source.*%2C*.hits.total.*%2C*.hits.hits._source.*%2C*.hits.hits.highlight.*%2C*.error%2C*.aggregations.*.buckets.key%2C*.aggregations.*.buckets.doc_count%2C*.aggregations.*.buckets.complex_value.hits.hits._source%2C*.aggregations.*.filtered_agg.facet.buckets.key%2C*.aggregations.*.filtered_agg.facet.buckets.doc_count%2C*.aggregations.*.filtered_agg.facet.buckets.complex_value.hits.hits._source"
    headers = {
        "authority": "search.olx.com.eg",
        "accept": "*/*",
        "accept-language": "en,ru;q=0.9",
        "authorization": "Basic b2x4LWVnLXByb2R1Y3Rpb24tc2VhcmNoOn1nNDM2Q0R5QDJmWXs2alpHVGhGX0dEZjxJVSZKbnhL",
        "content-type": "application/x-ndjson",
        "origin": "https://www.olx.com.eg",
        "referer": "https://www.olx.com.eg/",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-site",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.143 YaBrowser/22.5.0.1879 (beta) Yowser/2.5 Safari/537.36",
    }
    data = '{{"index":"olx-eg-production-ads-ar"}}\n{{"from":0,"size":0,"track_total_hits":false,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}}]}}}},"aggs":{{"category.lvl1.externalID":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.lvl0.externalID":"138"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"category.lvl1.externalID","size":20}}}}}}}}}}}},"location.lvl1":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl0.externalID":"0-1"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl1.externalID","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl1"]}}}}}}}}}}}}}}}}}},"product":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}},{{"term":{{"location.externalID":"0-1"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["product"]}}}}}}}}}}}}}}}}}},"totalProductCount":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["totalProductCount"]}}}}}}}}}}}}}}}}}}}}}}\n{{"index":"olx-eg-production-ads-ar"}}\n{{"from":45,"size":0,"track_total_hits":200000,"query":{{"function_score":{{"random_score":{{"seed":97}},"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}}]}}}}}}}},"sort":["_score"]}}\n{{"index":"olx-eg-production-ads-ar"}}\n{{"from":{},"size":45,"track_total_hits":200000,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}}]}}}},"sort":[{{"timestamp":{{"order":"desc"}}}},{{"id":{{"order":"desc"}}}}]}}\n'
    def start_requests(self):
        for i in range(0, 100045):
            pg = i + 45
            yield scrapy.Request(
                url=self.listing_endpoint,
                method="POST",
                headers=self.headers,
                body=self.data.format(pg),
                callback=self.parse_links,
            )
    def parse_links(self, response):
        try:
            listing_data = response.json()["responses"][2]["hits"]["hits"]
        except:
            listing_data = response.json()["responses"][1]["hits"]["hits"]
        for listing in listing_data:
            listing_id = listing["_source"]["externalID"]
            listing_url = "https://www.olx.com.eg/en/ad/" + listing_id
            yield scrapy.Request(
                url=listing_url,
                headers=self.headers,
                callback=self.parse_details,
                meta={"listing_url": listing_url},
            )
    def parse_details(self, response):
        item = {}
        reference_id = response.css("div._171225da::text").get().replace("Ad id ", "")
        sub_detail_list = response.css("div._676a547f ::text").extract()
        item["URL"] = response.meta.get("listing_url")
        try:
            item["Breadcrumb"] = (
                response.css("li._8c543153 ::text")[4].get()
                + "/"
                + response.css("li._8c543153 ::text")[3].get()
                + "/"
                + response.css("li._8c543153 ::text")[2].get()
                + "/"
                + response.css("li._8c543153 ::text")[1].get()
                + "/"
                + response.css("li._8c543153 ::text").get()
            )
        except:
            item["Breadcrumb"] = (
                response.css("li._8c543153 ::text")[3].get()
                + "/"
                + response.css("li._8c543153 ::text")[2].get()
                + "/"
                + response.css("li._8c543153 ::text")[1].get()
                + "/"
                + response.css("li._8c543153 ::text").get()
            )
        item["Price"] = response.css("span._56dab877 ::text").get()
        item["Title"] = response.css("h1.a38b8112::text").get()
        item["Type"] = response.css("div.b44ca0b3 ::text")[1].get()
        item["Bedrooms"] = response.css("span.c47715cd::text").get()
        try:
            item["Bathrooms"] = response.css("span.c47715cd::text")[1].get()
        except:
            item["Bathrooms"] = ""
        try:
            item["Area"] = response.css("span.c47715cd::text")[2].get()
        except:
            for sub in sub_detail_list:
                if "Area (m²)" in sub_detail_list:
                    item["Area"] = sub_detail_list[
                        sub_detail_list.index("Area (m²)") + 1
                    ]
                else:
                    item["Area"] = ""
        item["Location"] = response.css("span._8918c0a8::text").get()
        try:
            if response.css("div.b44ca0b3 ::text")[18].get() == "Compound":
                item["Compound"] = response.css("div.b44ca0b3 ::text")[19].get()
            elif response.css("div.b44ca0b3 ::text")[16].get() == "Compound":
                item["Compound"] = response.css("div.b44ca0b3 ::text")[17].get()
        except:
            item["Compound"] = ""
        item["seller"] = response.css("span._261203a9._2e82a662::text").getall()[1]
        member_since = response.css("span._34a7409b ::text")[1].get()
        if member_since == "Cars for Sale":
            item["Seller_member_since"] = response.css("span._34a7409b ::text").get()
        if "Commercial ID: " in member_since:
            item["Seller_member_since"] = response.css("span._34a7409b ::text")[2].get()
        else:
            item["Seller_member_since"] = member_since
        res = requests.get(
            f"https://www.olx.com.eg/api/listing/{reference_id}/contactInfo/"
        )
        item["Seller_phone_number"] = res.json()["mobile"]
        item["Description"] = (
            response.css("div._0f86855a ::text").get().replace("\n", "")
        )
        item["Amenities"] = ",".join(response.css("div._27f9c8ac ::text").extract())
        item["Reference"] = reference_id
        item["Listed_date"] = response.css("span._8918c0a8 ::text")[1].get()
        item["Level"] = ""
        item["Payment_option"] = ""
        item["Delivery_term"] = ""
        item["Furnished"] = ""
        item["Delivery_date"] = ""
        item["Down_payment"] = ""
        for sub_detail in sub_detail_list:
            if "Level" in sub_detail_list:
                item["Level"] = sub_detail_list[sub_detail_list.index("Level") + 1]
            if "Payment Option" in sub_detail_list:
                item["Payment_option"] = sub_detail_list[
                    sub_detail_list.index("Payment Option") + 1
                ]
            if "Delivery Term" in sub_detail_list:
                item["Delivery_term"] = sub_detail_list[
                    sub_detail_list.index("Delivery Term") + 1
                ]
            if "Furnished" in sub_detail_list:
                item["Furnished"] = sub_detail_list[
                    sub_detail_list.index("Furnished") + 1
                ]
            if "Delivery Date" in sub_detail_list:
                item["Delivery_date"] = sub_detail_list[
                    sub_detail_list.index("Delivery Date") + 1
                ]
            if "Down Payment" in sub_detail_list:
                item["Down_payment"] = sub_detail_list[
                    sub_detail_list.index("Down Payment") + 1
                ]
        item["Image_url"] = response.css("picture._219b7e0a ::attr(srcset)")[1].get()
        yield item
# main driver
if __name__ == "__main__":
    # run scraper
    process = CrawlerProcess()
    process.crawl(OlxScraper)
    process.start()
How do I overcome this 10,000 limit? Thanks in advance.
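One workaround worth trying, sketched below: Elasticsearch's own way past the 10,000-result window is search_after, which keeps from at 0 and pages using the sort values of the last hit. This assumes the OLX _msearch endpoint forwards that parameter unchanged (not verified here); the listing part of the ndjson body could then be built like this:

import json

def listing_query(search_after=None, size=45):
    # one header+body pair of the _msearch payload, mirroring the last query
    # in the question's POST data
    body = {
        "from": 0,  # stays 0; paging is done via search_after
        "size": size,
        "track_total_hits": 200000,
        "query": {"bool": {"must": [{"term": {"category.slug": "properties"}}]}},
        "sort": [{"timestamp": {"order": "desc"}}, {"id": {"order": "desc"}}],
    }
    if search_after is not None:
        body["search_after"] = search_after  # sort values of the last hit seen
    return json.dumps({"index": "olx-eg-production-ads-ar"}) + "\n" + json.dumps(body) + "\n"

# after each response, take the "sort" array of the last hit and feed it back:
#   hits = response.json()["responses"][-1]["hits"]["hits"]
#   next_page = listing_query(search_after=hits[-1]["sort"])

If the endpoint rejects search_after, the other common fallback is to slice the result set with filters (for example per-location or per-date ranges) so that each slice stays under 10,000 hits.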

Can I use a while loop with 'i' as a variable to be used in tr[i] in XPath?

import scrapy
import logging


class AssetSpider(scrapy.Spider):
    name = 'asset'
    start_urls = ['http://mnregaweb4.nic.in/netnrega/asset_report_dtl.aspx?lflag=eng&state_name=WEST%20BENGAL&state_code=32&district_name=NADIA&district_code=3201&block_name=KRISHNAGAR-I&block_code=&panchayat_name=DOGACHI&panchayat_code=3201009009&fin_year=2020-2021&source=national&Digest=8+kWKUdwzDQA1IJ5qhD8Fw']

    def parse(self, response):
        i = 4
        while i < 2236:
            assetid = response.xpath("//table[2]//tr['i']/td[2]/text()")
            assetcategory = response.xpath("//table[2]//tr['i']/td[3]/text()")
            schemecode = response.xpath("//table[2]//tr['i']/td[5]/text()")
            link = response.xpath("//table[2]//tr['i']/td[6]/a/@href")
            schemename = response.xpath("//table[2]//tr['i']/td[7]/text()")
            yield {
                'assetid': assetid,
                'assetcategory': assetcategory,
                'schemecode': schemecode,
                'link': link,
                'schemename': schemename
            }
            i += 1
I want to use the 'i' variable to loop over the tr[position] in the XPath from 4 to 2235. I just don't know if it is possible, and if it is, what is the right way to do it? Mine does not work.
Sure, it is possible and widely used.
You can format the string with the variable.
There are several syntaxes for that.
For example, you can do it like this:
i = 4
while i < 2236:
    assetid_path = "//table[2]//tr[{0}]/td[2]/text()".format(i)
    assetcategory_path = "//table[2]//tr[{0}]/td[3]/text()".format(i)
    schemecode_path = "//table[2]//tr[{0}]/td[5]/text()".format(i)
    link_path = "//table[2]//tr[{0}]/td[6]/a/@href".format(i)
    schemename_path = "//table[2]//tr[{0}]/td[7]/text()".format(i)
    assetid = response.xpath(assetid_path)
    assetcategory = response.xpath(assetcategory_path)
    schemecode = response.xpath(schemecode_path)
    link = response.xpath(link_path)
    schemename = response.xpath(schemename_path)
    yield {
        'assetid': assetid,
        'assetcategory': assetcategory,
        'schemecode': schemecode,
        'link': link,
        'schemename': schemename
    }
    i += 1
The above can be shortened like this:
i = 4
while i < 2236:
    root_path = "//table[2]//tr[{0}]".format(i)
    assetid_path = root_path + "/td[2]/text()"
    assetcategory_path = root_path + "/td[3]/text()"
    schemecode_path = root_path + "/td[5]/text()"
    link_path = root_path + "/td[6]/a/@href"
    schemename_path = root_path + "/td[7]/text()"
    assetid = response.xpath(assetid_path)
    assetcategory = response.xpath(assetcategory_path)
    schemecode = response.xpath(schemecode_path)
    link = response.xpath(link_path)
    schemename = response.xpath(schemename_path)
    yield {
        'assetid': assetid,
        'assetcategory': assetcategory,
        'schemecode': schemecode,
        'link': link,
        'schemename': schemename
    }
    i += 1
But the better way is to use a bound XPath variable, as follows:
i = 4
while i < 2236:
    assetid = response.xpath("//table[2]//tr[$i]/td[2]/text()", i=i)
    assetcategory = response.xpath("//table[2]//tr[$i]/td[3]/text()", i=i)
    schemecode = response.xpath("//table[2]//tr[$i]/td[5]/text()", i=i)
    link = response.xpath("//table[2]//tr[$i]/td[6]/a/@href", i=i)
    schemename = response.xpath("//table[2]//tr[$i]/td[7]/text()", i=i)
    yield {
        'assetid': assetid,
        'assetcategory': assetcategory,
        'schemecode': schemecode,
        'link': link,
        'schemename': schemename
    }
    i += 1
You send a string to xpath, so I would suggest using formatting, e.g.:
response.xpath(f"//table[2]//tr[{i}]/td[2]/text()")
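As a side note, here is a sketch of an alternative that avoids numeric indices entirely: iterate the matched rows and use relative XPaths inside each one (selector paths taken from the question; adjust the position bounds if the table has footer rows you want to skip):

def parse(self, response):
    # every <tr> of the second table from row 4 onward
    for row in response.xpath("//table[2]//tr[position() >= 4]"):
        yield {
            'assetid': row.xpath("./td[2]/text()").get(),
            'assetcategory': row.xpath("./td[3]/text()").get(),
            'schemecode': row.xpath("./td[5]/text()").get(),
            'link': row.xpath("./td[6]/a/@href").get(),
            'schemename': row.xpath("./td[7]/text()").get(),
        }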

Bitcoin Transaction Mapping Throws KeyError

I have the following piece of code, which seems to run until line 36, recipientlist.append(target["addr"]), and then throws the error KeyError: 'addr'.
However, 'addr' seems to be in the data, so I'm not sure what the issue is.
Can someone please help?
import json
import requests

z = 0
i = 0
firstpart = "https://blockchain.info/rawaddr/"
initialinput = '3PaGEcGDjPsNQHAQ4pTmjQuLXWoEwvnr11'
initialreq = firstpart + initialinput
firstjson = (requests.get(initialreq)).json()
graphvizlines = []
addresslist = []
usedaddresslist = []
addresslist.append(initialinput)
usedaddresslist.append(initialinput)
while i < 6:
    if z == 1:
        initialreq = firstpart + addresslist[i]
        firstjson = (requests.get(initialreq)).json()
    for transaction in firstjson["txs"]:
        payerlist = []
        recipientlist = []
        print("\n" + transaction["hash"])
        for item in transaction["inputs"]:
            payerlist.append(item["prev_out"]["addr"])
            if item["prev_out"]["addr"] not in addresslist:
                addresslist.append(item["prev_out"]["addr"])
        for target in transaction["out"]:
            recipientlist.append(target["addr"])
            if target["addr"] not in addresslist:
                addresslist.append(target["addr"])
        for payer in payerlist:
            for recipient in recipientlist:
                a = '"' + payer + '"' + " -> " + '"' + recipient + '"' + ";"
                if a not in graphvizlines:
                    graphvizlines.append(a)
    i = i + 1
    z = 1
for t in graphvizlines:
    print(t)
While addr is in your data, it's not in every inputs element. Check the very last element in txs; you'll see that its inputs is:
"inputs": [
{
"sequence": 0,
"witness": "304402203f872bfd7093fcdad6a3735cbd76f276279890b0304e6f23f54c51388cc2a84402203731d7a7f71265f072f6792c8f4d2e805ff8f86bbfbd0b48a187d573c051593001",
"prev_out": {
"spent": true,
"spending_outpoints": [
{
"tx_index": 0,
"n": 0
}
],
"tx_index": 0,
"type": 0,
"value": 1880609,
"n": 1,
"script": "0014292738ed3f9466f8eedd8c49e5bb013088a7052b"
},
"script": ""
}
],
This element lacks prev_out.addr.
You will need to first check whether the addr element exists, or wrap your loop in a try/except.
for transaction in firstjson['txs']:
    ...
    for item in transaction['inputs']:
        address = item.get('prev_out').get('addr')
        if address is None:
            continue
        payerlist.append(address)
    ...
The above would still fail if prev_out didn't exist, so you should confirm what will always be in the result and what might be missing.
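A small sketch of that "check first" route, using a default for prev_out as well so the loop survives both a missing prev_out and a missing addr:

for item in transaction["inputs"]:
    address = item.get("prev_out", {}).get("addr")
    if address is None:
        continue  # skip inputs that carry no address
    payerlist.append(address)
    if address not in addresslist:
        addresslist.append(address)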

Parsing logs to JSON in Python

Folks,
I am trying to parse a log file into JSON format.
I have a lot of logs; here is one of them.
How can I parse this?
03:02:03.113 [info] ext_ref = BANK24AOS_cl_reqmarketcreditorderstate_6M8I1NT8JKYD_1591844522410384_4SGA08M8KIXQ reqid = 1253166 type = INREQ channel = BANK24AOS sid = msid_1591844511335516_KRRNBSLH2FS duration = 703.991 req_uri = marketcredit/order/state login = 77012221122 req_type = cl_req req_headers = {"accept-encoding":"gzip","connection":"close","host":"test-mobileapp-api.bank.kz","user-agent":"okhttp/4.4.1","x-forwarded-for":"212.154.169.134","x-real-ip":"212.154.169.134"} req_body = {"$sid":"msid_1591844511335516_KRRNBSLH2FS","$sid":"msid_1591844511335516_KRRNBSLH2FS","app":"bank","app_version":"2.3.2","channel":"aos","colvir_token":"GExPR0lOX1BBU1NXT1JEX0NMRUFSVEVYVFNzrzh4Thk1+MjDKWl/dDu1fQPsJ6gGLSanBp41yLRv","colvir_commercial_id":"-1","colvir_id":"000120.335980","openway_commercial_id":"6247520","openway_id":"6196360","$lang":"ru","ekb_id":"923243","inn":"990830221722","login":"77012221122","bank24_id":"262"} resp_body = {"task_id":"","status":"success","data":{"state":"init","applications":[{"status":"init","id":"123db561-34a3-4a8d-9fa7-03ed6377b44f","name":"Sulpak","amount":101000,"items":[{"name":"Switch CISCO x24","price":100000,"count":1,"amount":100000}]}],"segment":{"range":{"min":6,"max":36,"step":1},"payment_day":{"max":28,"min":1}}}}
Into this type of JSON, or any other format (but I guess JSON is the best one):
{
    "time": "03:02:03.113",
    "class_req": "info",
    "ext_ref": "BANK24AOS_cl_reqmarketcreditorderstate_6M8I1NT8JKYD_1591844522410384_4SGA08M8KIXQ",
    "reqid": "1253166",
    "type": "INREQ",
    "channel": "BANK24AOS",
    "sid": "msid_1591844511335516_KRRNBSLH2FS",
    "duration": "703.991",
    "req_uri": "marketcredit/order/state",
    "login": "77012221122",
    "req_type": "cl_req",
    "req_headers": {
        "accept-encoding": "gzip",
        "connection": "close",
        "host": "test-mobileapp-api.bank.kz",
        "user-agent": "okhttp/4.4.1",
        "x-forwarded-for": "212.154.169.134",
        "x-real-ip": "212.154.169.134"
    },
    "req_body": {
        "$sid": "msid_1591844511335516_KRRNBSLH2FS",
        "$sid": "msid_1591844511335516_KRRNBSLH2FS",
        "app": "bank",
        "app_version": "2.3.2",
        "channel": "aos",
        "colvir_token": "GExPR0lOX1BBU1NXT1JEX0NMRUFSVEVYVFNzrzh4Thk1+MjDKWl/dDu1fQPsJ6gGLSanBp41yLRv",
        "colvir_commercial_id": "-1",
        "colvir_id": "000120.335980",
        "openway_commercial_id": "6247520",
        "openway_id": "6196360",
        "$lang": "ru",
        "ekb_id": "923243",
        "inn": "990830221722",
        "login": "77012221122",
        "bank24_id": "262"
    },
    "resp_body": {
        "task_id": "",
        "status": "success",
        "data": {
            "state": "init",
            "applications": [
                {
                    "status": "init",
                    "id": "123db561-34a3-4a8d-9fa7-03ed6377b44f",
                    "name": "Sulpak",
                    "amount": 101000,
                    "items": [
                        {
                            "name": "Switch CISCO x24",
                            "price": 100000,
                            "count": 1,
                            "amount": 100000
                        }
                    ]
                }
            ],
            "segment": {
                "range": {
                    "min": 6,
                    "max": 36,
                    "step": 1
                },
                "payment_day": {
                    "max": 28,
                    "min": 1
                }
            }
        }
    }
}
I tried splitting the whole text first, but then I hit another problem: matching keys to values based on the '=' sign. Also, there might be some keys with empty values. For example:
type = INREQ channel = sid = duration = 1.333 (to tell that a value is empty, you have to pay attention to the number of spaces; usually there is one space between the previous value and the next key). So this example should look like this:
{
    "type": "INREQ",
    "channel": "",
    "sid": "",
    "duration": "1.333"
}
Thanks in advance!
Here is one approach; note that the duplicate key "$sid":"msid_1591844511335516_KRRNBSLH2FS" collapses to a single entry, since a parsed JSON object cannot hold duplicate keys.
import re
import json
text = """03:02:03.113 [info] ext_ref = reqid = 1253166 type = INREQ channel = BANK24AOS sid = msid_1591844511335516_KRRNBSLH2FS duration = 703.991 req_uri = marketcredit/order/state login = 77012221122 req_type = cl_req req_headers = {"accept-encoding":"gzip","connection":"close","host":"test-mobileapp-api.bank.kz","user-agent":"okhttp/4.4.1","x-forwarded-for":"212.154.169.134","x-real-ip":"212.154.169.134"} req_body = {"$sid":"msid_1591844511335516_KRRNBSLH2FS","$sid":"msid_1591844511335516_KRRNBSLH2FS","app":"bank","app_version":"2.3.2","channel":"aos","colvir_token":"GExPR0lOX1BBU1NXT1JEX0NMRUFSVEVYVFNzrzh4Thk1+MjDKWl/dDu1fQPsJ6gGLSanBp41yLRv","colvir_commercial_id":"-1","colvir_id":"000120.335980","openway_commercial_id":"6247520","openway_id":"6196360","$lang":"ru","ekb_id":"923243","inn":"990830221722","login":"77012221122","bank24_id":"262"} resp_body = {"task_id":"","status":"success","data":{"state":"init","applications":[{"status":"init","id":"123db561-34a3-4a8d-9fa7-03ed6377b44f","name":"Sulpak","amount":101000,"items":[{"name":"Switch CISCO x24","price":100000,"count":1,"amount":100000}]}],"segment":{"range":{"min":6,"max":36,"step":1},"payment_day":{"max":28,"min":1}}}}"""
index1 = text.index('[')
index2 = text.index(']')
new_text = 'time = '+ text[:index1-1] + ' class_req = ' + text[index1+1:index2] + text[index2+2:]
lst = re.findall(r'\S+? = |\S+? = \{.*?\} |\S+? = \{.*?\}$|\S+? = \S+? ', new_text)
res = {}
for item in lst:
    key, equal, value = item.partition('=')
    key, value = key.strip(), value.strip()
    if value.startswith('{'):
        try:
            value = json.loads(value)
        except:
            print(value)
    res[key] = value
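To get the JSON text the question asks for, the res dict built above can then simply be serialized:

print(json.dumps(res, ensure_ascii=False, indent=4))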
You can try regular expressions in Python.
Here is what I wrote; it works for your problem.
For convenience I deleted the text before "ext_ref..."; you can simply truncate the raw string the same way.
import re
import json
string = 'ext_ref = BANK24AOS_cl_reqmarketcreditorderstate_6M8I1NT8JKYD_1591844522410384_4SGA08M8KIXQ reqid = 1253166 type = INREQ channel = BANK24AOS sid = msid_1591844511335516_KRRNBSLH2FS duration = 703.991 req_uri = marketcredit/order/state login = 77012221122 req_type = cl_req req_headers = {"accept-encoding":"gzip","connection":"close","host":"test-mobileapp-api.bank.kz","user-agent":"okhttp/4.4.1","x-forwarded-for":"212.154.169.134","x-real-ip":"212.154.169.134"} req_body = {"$sid":"msid_1591844511335516_KRRNBSLH2FS","$sid":"msid_1591844511335516_KRRNBSLH2FS","app":"bank","app_version":"2.3.2","channel":"aos","colvir_token":"GExPR0lOX1BBU1NXT1JEX0NMRUFSVEVYVFNzrzh4Thk1+MjDKWl/dDu1fQPsJ6gGLSanBp41yLRv","colvir_commercial_id":"-1","colvir_id":"000120.335980","openway_commercial_id":"6247520","openway_id":"6196360","$lang":"ru","ekb_id":"923243","inn":"990830221722","login":"77012221122","bank24_id":"262"} resp_body = {"task_id":"","status":"success","data":{"state":"init","applications":[{"status":"init","id":"123db561-34a3-4a8d-9fa7-03ed6377b44f","name":"Sulpak","amount":101000,"items":[{"name":"Switch CISCO x24","price":100000,"count":1,"amount":100000}]}],"segment":{"range":{"min":6,"max":36,"step":1},"payment_day":{"max":28,"min":1}}}}'
position = re.search("req_headers",string) # position of req_headers
resp_body_pos = re.search("resp_body",string)
resp_body = string[resp_body_pos.span()[0]:]
res1 = {}
res1.setdefault(resp_body.split("=")[0],resp_body.split("=")[1])
print(res1)
before = string[:position.span()[0]]
after = string[position.span()[0]:resp_body_pos.span()[0]]  # handle req_body separately
res2 = re.findall(r"(\S+) = (\S+)", before)
print(res2)
res3 = re.findall(r"(\S+) = ({.*?})", after)
print(res3)
# res1 type: dict {'resp_body': '...'}  content of resp_body
# res2 type: list [(), ()...]  content before req_headers
# res3 type: list [(), ()...]  the rest of the content
And now you can do what you want with the data (e.g. transform each part into JSON).
Hope this is helpful.

Question about extracting data from JSON using Python

I am building a bot game for my friends in LINE. I'm a beginning coder. I'm trying to access an object in JSON which includes a string + integer. I've looked around but nothing seems to fit what I need. What would be the best/simplest solution?
My code is amateur, please go easy on me. :P
I'm trying to have Python extract "name" + "stats" from the JSON.
Right now it only extracts "name" and randomly selects an item. Is there any way to select the item + the stat, display the item, and calculate the stat? Thanks.
Python 3:
if text == 'FIGHT':
    with open('items.json', 'r') as f:
        data = json.load(f)
    armor1 = [v for d in data['armor'] for k, v in d.items() if k == 'name']
    weapon1 = [v for d in data['weapon'] for k, v in d.items() if k == 'name']
    magic1 = [v for d in data['magic'] for k, v in d.items() if k == 'name']
    armor2 = random.choice(armor1)
    weapon2 = random.choice(weapon1)
    magic2 = random.choice(magic1)
    calc = add(int(armor2), int(weapon2), int(magic2))
    line_bot_api.reply_message(
        event.reply_token,
        TextSendMessage('Armor = ' + (armor2)),
        TextSendMessage('Weapon = ' + (weapon2)),
        TextSendMessage('Magic = ' + (magic2)),
        TextSendMessage('You have a score of ' + str(calc) + '.'),
        TextSendMessage('Waiting for next opponent...')
    )
Json:
"armor": [
{
"name":"Cardboard armor 10 DEF" ,
"stats":"10" },
{
"name":"Plastic armor 20 DEF" ,
"stats":"20" },
{
"name":"Rubber armor 30 DEF" ,
"stats":"30" },
{
"name":"Metal armor 40 DEF" ,
"stats":"40" },
{
"name":"Indestructable armor 50 DEF" ,
"stats":"50" }
],
After trying just about everything, the solution was:
if text == 'FIGHT':
    with open('items.json', 'r') as f:
        data = json.load(f)
    armor2 = random.choice(data['armor'])
    weapon2 = random.choice(data['weapon'])
    magic2 = random.choice(data['magic'])
    calc = add(armor2['stats'], weapon2['stats'], magic2['stats'])
    line_bot_api.reply_message(
        event.reply_token, [
            TextSendMessage('Armor = ' + (armor2['name'])),
            TextSendMessage('Weapon = ' + (weapon2['name'])),
            TextSendMessage('Magic = ' + (magic2['name'])),
            TextSendMessage('Total = ' + str(calc) + '.')
        ]
    )
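The add helper isn't shown in the question; a minimal version consistent with the string "stats" values in items.json might look like this (hypothetical, not taken from the original code):

def add(*stats):
    # "stats" values are stored as strings such as "10", so convert before summing
    return sum(int(s) for s in stats)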
Thanks to everyone and special thanks to my friend Sae who helped me. :)
