JSONDecodeError: Expecting ',' delimiter in a long JSON string - python

I'm trying to parse the following JSON, but I always get the error "JSONDecodeError: Expecting ',' delimiter".
Here is the code i'm doing:
# Question code: fetch the Indeed reviews page and pull the JSON blob that
# the page embeds in its 17th <script> tag (window._initialData).
import requests
from bs4 import BeautifulSoup
import json

reviews_url = "https://www.indeed.com/cmp/Ocean-Beauty-Seafoods/reviews?start=0"
resp = requests.get(reviews_url, verify=False)
parsed_page = BeautifulSoup(resp.content, 'html.parser')
# Strip the JavaScript wrapper so only the JSON.parse argument remains.
raw_json = (
    parsed_page.findAll('script')[16]
    .text.replace("\n window._initialData=JSON.parse(\'", "")
    .replace("');", "")
)
json.loads(raw_json)
Many thanks.

The Json as it is isn't valid. Try to "preprocess" it first with ast.literal_eval:
# Answer: the extracted blob is not valid JSON as-is -- run it through
# ast.literal_eval first to resolve the JavaScript string escapes, then parse.
import json
import requests
from ast import literal_eval
from bs4 import BeautifulSoup

reviews_url = "https://www.indeed.com/cmp/Ocean-Beauty-Seafoods/reviews?start=0"
resp = requests.get(reviews_url, verify=False)
doc = BeautifulSoup(resp.content, "html.parser")
# Cut away the JS assignment wrapper around the JSON.parse argument.
script_text = doc.findAll("script")[16].text
raw = script_text.replace("\n window._initialData=JSON.parse('", "").replace("');", "")
# Evaluating it as a triple-quoted Python literal undoes the escaping.
decoded = literal_eval("'''" + raw + "'''")
data = json.loads(decoded)
print(json.dumps(data, indent=4))
Prints:
{
"breadcrumbs": {
"breadcrumbs": [
{
"name": "Companies",
"noFollow": false,
"url": "https://www.indeed.com/companies"
},
{
"name": "Ocean Beauty Seafoods",
"noFollow": false,
"url": "https://www.indeed.com/cmp/Ocean-Beauty-Seafoods"
},
{
"name": "Employee Reviews",
"noFollow": false
}
]
},
"companyPageFooter": {
"enabledToShowUserFeedbackForm": false,
"encodedFccId": "a9c95405fb0cdb1c",
"stickyJobsTabLink": {
"jobsLink": "/cmp/Ocean-Beauty-Seafoods/jobs"
}
},
"companyPageHeader": {
"auroraLogoUrl": "https://d2q79iu7y748jz.cloudfront.net/s/_squarelogo/64x64/147cafc3914ffb4693dc99df6ad0b169",
"auroraLogoUrl2x": "https://d2q79iu7y748jz.cloudfront.net/s/_squarelogo/128x128/147cafc3914ffb4693dc99df6ad0b169",
"brandColor": "#FFFFFF",
"companyHeader": {
"name": "Ocean Beauty Seafoods",
"rating": 3.7,
"reviewCount": 114,
"reviewCountFormatted": "114",
"reviewsUrl": "/cmp/Ocean-Beauty-Seafoods/reviews"
},
...and so on.

Related

Different content when accessing a website with requests

I am trying to get corresponding handle IDs in ARIN automatically using a company's name, like "Google".
https://search.arin.net/rdap/?query=google*
My naive approach is to use requests and BeautifulSoup:
# Question code: scrape the ARIN RDAP search page directly with requests.
import requests
from bs4 import BeautifulSoup

base_url = 'https://search.arin.net/rdap/?query='
company = 'google*'
resp = requests.get(base_url + company)
page = BeautifulSoup(resp.text, 'html.parser')
# example search
found = page.body.find_all(text = "Handle$")
However, I do not get the same output when I am using requests as when I simply use Google Chrome. The html code that is returned by requests is different and I cannot access the corresponding handles.
Does anyone know how to change the code?
The data you see on the page is loaded from external API URL. You can use requests module to simulate it:
# Answer: the page is populated from an external API endpoint --
# call that endpoint directly instead of scraping the rendered HTML.
import json
import requests

endpoint = "https://rdap.arin.net/registry/entities"
query = {"fn": "google*"}
data = requests.get(endpoint, params=query).json()
# pretty print the data:
print(json.dumps(data, indent=4))
Prints:
...
{
"handle": "GF-231",
"vcardArray": [
"vcard",
[
[
"version",
{},
"text",
"4.0"
],
[
"fn",
{},
"text",
"GOOGLE FIBER INC"
],
[
"adr",
{
"label": "3425 MALONE DR\nCHAMBLEE\nGA\n30341\nUnited States"
},
"text",
[
"",
"",
"",
"",
"",
"",
""
]
],
[
"kind",
{},
"text",
"org"
]
]
],
...

add a value to a json body request in python

I have created my own function which i import called timestamp, it returns two values :
from requests.auth import HTTPBasicAuth
import requests
import json
def timeframe():
    """Fetch Profiler timestamps and return (start_time, end_time) for the
    first entry whose data_resolution is 'min' and datasource is
    'FDS_TRAFFIC'.

    NOTE(review): if no entry matches, execution falls off the end and the
    function returns None, which any caller unpacking two values will crash
    on.  The indentation below is reconstructed -- the original post lost
    it; the return is assumed to sit inside the inner if (TODO confirm).
    """
    # verify=False disables TLS certificate checking; "$host" is a
    # placeholder the poster substitutes with the appliance hostname.
    response = requests.get("https://$host/api/profiler/1.13/reporting/timestamps.json", verify=False, auth=HTTPBasicAuth("admin", "admin"))
    time = response.json()
    for entry in time:
        if entry.get('data_resolution') == 'min':
            if entry.get('datasource') == 'FDS_TRAFFIC':
                start_time = entry['start_time']
                end_time = entry['end_time']
                return start_time, end_time
# Module-level call; the returned tuple is discarded here.
timeframe()
i need to add timestamps to a keys in a json body request, you will see 'end' & 'start' keys. I need to retrieve those timestamps and somehow add them to those keys.
# Question code: POST a report request whose time_frame is filled from the
# (start_time, end_time) tuple returned by timestamp.timeframe().
import requests
import timestamp
from requests.auth import HTTPBasicAuth
import urllib3
# Silence the warning triggered by verify=False below.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# stamp is a (start_time, end_time) tuple.
stamp = timestamp.timeframe()
print(stamp)
url = 'http://10.65.170.112/api/profiler/1.12/reporting/reports'
headers = {'Content-Type': 'application/json'}
payload = {
    "criteria": {
        "time_frame": {
            # The tuple elements are stringified into the body here.
            "start": str(stamp[0]),
            "end": str(stamp[1]),
            "resolution": "flow"
        },
        "query": {
            "realm": "traffic_flow_list",
            "sort_column": 41,
            "devices": [
                {
                    "ipaddr": "10.65.170.2"
                }
            ],
            "group_by": "flw",
            "columns": [
                729,
                40,
                41,
                14,
                44,
                10,
                45,
                46
            ]
        }
    },
    "template_id": 184
}
# NOTE(review): data=payload sends the dict form-encoded, not as JSON,
# despite the Content-Type header -- this is what the accepted answer
# fixes with data=json.dumps(payload).
req = requests.post(url, headers=headers, data = payload, verify=False, auth=HTTPBasicAuth('admin', 'admin'),)
print(req.status_code, req.text)
Not sure what to do.
Thanks
The function you created returns a tuple: return start_time, end_time.
So, a way to implement would be:
start, end = timestamp.timeframe()
Then, you can hydrate your body:
body = {
"criteria": {
"time_frame": {
"end": end,
"start": start,
"resolution": "flow"
},
# Answer: serialise the body with json.dumps so the server receives real
# JSON -- passing the dict via data= would send it form-encoded.
import requests
import timestamp
from requests.auth import HTTPBasicAuth
import urllib3
import json

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

start_end = timestamp.timeframe()
credentials = HTTPBasicAuth("admin", "admin")
report_url = "https://$host/api/profiler/1.13/reporting/reports"
request_headers = {'Content-Type': 'application/json'}
body = {
    "criteria": {
        "time_frame": {
            "start": start_end[0],
            "end": start_end[1],
            "resolution": "flow"
        },
        "query": {
            "realm": "traffic_summary",
            "sort_column": 41,
            "devices": [{ "ipaddr": "10.65.170.2"}],
            "group_by": "hos",
            "columns": [729, 40, 41, 14, 44, 10, 45, 46]
        }
    },
    "template_id": 184
}
response = requests.post(report_url, verify=False, auth=credentials, headers=request_headers, data=json.dumps(body))
print(response.headers)
Was resolved by adding data=json.dumps(payload)

How do I look for the right class and id for parsing a page?

This is the code I have so far; I'm attempting to get the subscriber count.
This is this the error I get:
AttributeError: 'NoneType' object has no attribute 'find_all'
I hope someone can help!
# Question code: try to scrape a channel's subscriber count.
from bs4 import BeautifulSoup
import requests as r

# FIX: the original post was missing the closing quote on this URL string,
# which is a SyntaxError before anything else can run.
url = r.get("https://www.youtube.com/channel/UC57EgpLB1Q0tXc5tWDhttoQ")
soup_content = BeautifulSoup(url.content, 'html.parser')
# NOTE(review): YouTube builds this markup with JavaScript, so find(id="meta")
# presumably returns None in the raw HTML -- that is the AttributeError the
# question reports on the next line.
id_ = soup_content.find(id="meta")
class_ = id_.find_all(class_="style-scope ytd-c4-tabbed-header-renderer")
hopefully_it_works = class_[0]
print(hopefully_it_works.prettify())
the BS and web scraping is not the correct way how to get the youtube data.
Please, consider using the official youtube api, which provides these data you need and much more :)
https://developers.google.com/youtube/v3/docs/
you can try your requests here https://developers.google.com/youtube/v3/docs/channels/list#try-it
in your case you need to fill:
part: statistics
id: UC57EgpLB1Q0tXc5tWDhttoQ
(https://www.googleapis.com/youtube/v3/channels?part=statistics&id=channel_id&key=your_key)
the response is
{
"kind": "youtube#channelListResponse",
"etag": "eQRUDH-2j1eYIpexSuSOsz12tc8",
"pageInfo": {
"totalResults": 1,
"resultsPerPage": 1
},
"items": [
{
"kind": "youtube#channel",
"etag": "D-yZ896UMFRcDDSrfATBaiygDkc",
"id": "UC57EgpLB1Q0tXc5tWDhttoQ",
"statistics": {
"viewCount": "9",
"commentCount": "0",
"subscriberCount": "3",
"hiddenSubscriberCount": false,
"videoCount": "1"
}
}
]
}
edit:
if you really want to use BS, there is your solution:
# Answer (edit): scrape the subscriber count with BeautifulSoup.
from bs4 import BeautifulSoup as bs
import requests as r

page = r.get("https://www.youtube.com/channel/UC57EgpLB1Q0tXc5tWDhttoQ")
doc = bs(page.content, "html.parser")
# Walk down to the branded subscriber-count span; the class strings must
# match the page markup exactly.
outer = doc.find("span", attrs={"class": "channel-header-subscription-button-container yt-uix-button-subscription-container with-preferences"})
count_span = outer.find("span", attrs={"class": "yt-subscription-button-subscriber-count-branded-horizontal subscribed yt-uix-tooltip"})
channel_subscribers = count_span.text
print(channel_subscribers)

How to print json info with python?

I have a json (url = http://open.data.amsterdam.nl/ivv/parkeren/locaties.json) and I want to print all 'title', 'adres', 'postcode'. How can I do that?
I want to print it like this:
title.
adres.
postcode.
title.
adres.
postcode.
i.e., each record's fields printed one below the other.
I hope you can help me with this
# Question code: fetch the Amsterdam parking-locations JSON and read the
# fields from the top level.  The top-level object only contains the
# "parkeerlocaties" key, so these lookups fail -- see the answer below.
import urllib, json
import requests

locations_url = "http://open.data.amsterdam.nl/ivv/parkeren/locaties.json"
record = requests.get(locations_url).json()
print(record['title'])
print(record['adres'])
print(record['postcode'])
Using print(json.dumps(search, indent=4)) you can see that the structure is
{
"parkeerlocaties": [
{
"parkeerlocatie": {
"title": "Fietsenstalling Tolhuisplein",
"Locatie": "{\"type\":\"Point\",\"coordinates\":[4.9032801,52.3824545]}",
...
}
},
{
"parkeerlocatie": {
"title": "Fietsenstalling Paradiso",
"Locatie": "{\"type\":\"Point\",\"coordinates\":[4.8833735,52.3621851]}",
...
}
},
So to access the inner properties, you need to follow the JSON path
# Answer: follow the JSON path down through the wrapper objects.
import requests

# FIX: the original had a stray leading space inside the URL string literal.
url = 'http://open.data.amsterdam.nl/ivv/parkeren/locaties.json'
search = requests.get(url).json()
# Each element of "parkeerlocaties" wraps the real record under the
# "parkeerlocatie" key, so unwrap it before reading the fields.
for parkeerlocatie in search["parkeerlocaties"]:
    content = parkeerlocatie['parkeerlocatie']
    print(content['title'])
    print(content['adres'])
    print(content['postcode'])
    print()

How to get text within <script> tag

I am scraping the LaneBryant website.
Part of the source code is
<script type="application/ld+json">
{
"#context": "http://schema.org/",
"#type": "Product",
"name": "Flip Sequin Teach & Inspire Graphic Tee",
"image": [
"http://lanebryant.scene7.com/is/image/lanebryantProdATG/356861_0000015477",
"http://lanebryant.scene7.com/is/image/lanebryantProdATG/356861_0000015477_Back"
],
"description": "Get inspired with [...]",
"brand": "Lane Bryant",
"sku": "356861",
"offers": {
"#type": "Offer",
"url": "https://www.lanebryant.com/flip-sequin-teach-inspire-graphic-tee/prd-356861",
"priceCurrency": "USD",
"price":"44.95",
"availability": "http://schema.org/InStock",
"itemCondition": "https://schema.org/NewCondition"
}
}
}
}
</script>
In order to get price in USD, I have written this script:
def getPrice(self,start):
    """Try to extract the product price from the page soup.

    start: a BeautifulSoup document of the product page (built from
    Selenium's page_source by the caller shown below).

    NOTE(review): stringifying the whole <script> tag and splitting on
    commas, then indexing element 11, is position-dependent -- any change
    in the JSON-LD layout shifts the index.  Parsing the tag's .text with
    json.loads would be robust; this fragility is the question's subject.
    """
    fprice=[]
    discount = ""
    # First JSON-LD <script> tag; its text holds the product JSON shown above.
    price1 = start.find('script', {'type': 'application/ld+json'})
    data = ""
    #print("price 1 is + "+ str(price1)+"data is "+str(data))
    # Split the raw tag string (including the "<script ...>" markup) on commas.
    price1 = str(price1).split(",")
    #price1=str(price1).split(":")
    # Element 11 is assumed to be the "price" field -- TODO confirm.
    print("final price +"+ str(price1[11]))
where start is :
# Render the page with Selenium (the JSON-LD is present in page_source)
# and hand the parsed soup to getPrice as `start`.  `url` and `webdriver`
# are defined elsewhere in the poster's code.
d = webdriver.Chrome('/Users/fatima.arshad/Downloads/chromedriver')
d.get(url)
start = BeautifulSoup(d.page_source, 'html.parser')
It doesn't print the price even though I am getting correct text. How do I get just the price?
In this instance you can just regex for the price
# Answer (option 1): regex the price straight out of the raw page text.
import requests, re

product_url = 'https://www.lanebryant.com/flip-sequin-teach-inspire-graphic-tee/prd-356861#color/0000015477'
resp = requests.get(product_url, headers={'User-Agent': 'Mozilla/5.0'})
price_pattern = re.compile(r'"price":"(.*?)"')
print(price_pattern.findall(resp.text)[0])
Otherwise, target the appropriate script tag by id and then parse the .text with json library
# Answer (option 2): parse the page's #pdpInitialData script content with
# the json library instead of string-splitting.
import requests, json
from bs4 import BeautifulSoup

product_url = 'https://www.lanebryant.com/flip-sequin-teach-inspire-graphic-tee/prd-356861#color/0000015477'
resp = requests.get(product_url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'html.parser')
pdp_data = json.loads(soup.select_one('#pdpInitialData').text)
price = pdp_data['pdpDetail']['product'][0]['price_range']['sale_price']
print(price)
price1 = start.find('script', {'type': 'application/ld+json'})
This is actually the <script> tag, so a better name would be
script_tag = start.find('script', {'type': 'application/ld+json'})
You can access the text inside the script tag using .text. That will give you the JSON in this case.
json_string = script_tag.text
Instead of splitting by commas, use a JSON parser to avoid misinterpretations:
import json
clothing=json.loads(json_string)

Categories