extract a class from beautiful soup - python

I have an HTML script that after extraction looks something like this:
class="a-toaster Toaster_toaster__bTabZ">
</div>
</div>
</div>
<script id="__NEXT_DATA__" type="application/json">
{"props":{"pageProps": {"type":"Job","sid":"a84cacbbcb07ec55cdbfd5fbe3d9f252d7f9cdd0","loggedIn":false,"userId":null,"avatar":null,"rating":{"count":"743","value":6.6},"metadata":{"title":"Medior/Senior Tester0"}
I am interested in extracting certain key value pairs of this script into a dataframe. I would for example like a column named "title" with the value "Medior/Senior Tester0" and a column "customer" filled with null.
soup.find('a-toaster Toaster_toaster__bTabZ') returns None, so accessing its attributes raises a NoneType error. What would be a good way to extract, for example, the title of this HTML ("Medior/Senior Tester0")?

Try:
import json
from bs4 import BeautifulSoup

# Sample of the page: the data of interest lives in the JSON payload of the
# <script id="__NEXT_DATA__"> tag, not in the surrounding HTML elements.
html_doc = """\
<div class="a-toaster Toaster_toaster__bTabZ">
</div>
</div>
</div>
<script id="__NEXT_DATA__" type="application/json">
{"props":{"pageProps": {"type":"Job","sid":"a84cacbbcb07ec55cdbfd5fbe3d9f252d7f9cdd0","loggedIn":false,"userId":null,"avatar":null,"rating":{"count":"743","value":6.6},"metadata":{"title":"Medior/Senior Tester0"} } } }
</script>
</div>"""

soup = BeautifulSoup(html_doc, "html.parser")
# Locate the script tag by its id (CSS selector "#__NEXT_DATA__").
script = soup.select_one("#__NEXT_DATA__")
if script is None:
    # Fail with a clear message instead of the cryptic
    # "'NoneType' object has no attribute ..." error the asker ran into.
    raise ValueError("no <script id='__NEXT_DATA__'> tag found in the document")
# Decode the JSON body of the tag into plain Python dicts/lists.
data = json.loads(script.text)
# print all data:
print(data)
Prints:
{
"props": {
"pageProps": {
"type": "Job",
"sid": "a84cacbbcb07ec55cdbfd5fbe3d9f252d7f9cdd0",
"loggedIn": False,
"userId": None,
"avatar": None,
"rating": {"count": "743", "value": 6.6},
"metadata": {"title": "Medior/Senior Tester0"},
}
}
}
To print the title:
print(data["props"]["pageProps"]["metadata"]["title"])
Prints:
Medior/Senior Tester0

Related

Get data from inside a <script> tag with beautifulsoup

I use Beautifulsoup to get data from a website where the data I need is inside a <script> tag.
I get a response looking like this and want to get the content of "name", "thumbnailUrl", "account" and "Id":
<script type="text/javascript">
var modelData = {
name: 'somename',
thumbnailUrl: 'https://website.com/blob/bG9uZ2RhbjovL0ZPVVIvbGRwcm9kLWRlL3ljb3B6YTY4N0pnQ2Nfc3JYcVV3VXc9PQ',
account: '5LH7J44IYPAGEZEYA9KIL',
Id: 'someid'
};
store.initOmlib({"ClusterEndpoints":{"ONE":["http://us.site.me"],"TWO":["http://sg.site.me"],"FOUR":["http://de.site.me"],"FIVE":["http://in.site.me"],"SIX":["http://ja.site.me"],"SEVEN":["http://br.site.me"]},"ClusterEndpointsInternal":{"ONE":["http://usi.site.me"],"TWO":["http://sgi.site.me"],"FOUR":["http://dei.site.me"],"FIVE":["http://ini.site.me"],"SIX":["http://jai.site.me"],"SEVEN":["http://bri.site.me"]},"ClusterKeys":{"FIVE":"Cf0Mw0I2/cZf6alwfMhelEEOb6xq23IhPvC9E4eoaXU=","SIX":"bfYXVkWhs/gv+TCJ3EeeEE3oxiZRDpJO0fecUGdq2Qg=","ONE":"xkkzyNJmZ1DmNPxGwrykZ2O91f10KNXQvspa15nKKGs=","FOUR":"xMRCvh1eki9JEceBcV7Bx49uaQYpX8FdD0eZ+LCGqCc=","TWO":"XaG4I7b7wDOZ+lGHSPwbJ2HLkIFf0UGYAWz9c9LkiQk=","SEVEN":"LuSOGA/u5PL7rP8PG3cr6bqgQy7jXEv65iuHUX9ePQY="},"DefaultCluster":"ONE","IdpEndpoints":["http://idp.site.me"],"IdpKey":"MIOC9PS8KIwXOXSHtplBZLSpIqcifns0jzExtkHXw1g=","ReadOnlyEndpoints":["http://site.gg"],"ReadOnlyKey":"QKxHfdLVgbn+VYpnUiCFLMq/hhUpkpx7occEY3Z0Wnk="}, {"Id":"001026a1c1064a1b9305400814783c2385e2a978f13a","Secret":"0110de13b2187fe3078e13d9f6ad4e7567fdc143e915c9cb4df67ca"});
if (store.renderArc) {
store.renderArc(document.getElementById('root'), modelData, translateTable);
} else {
store.renderUser(document.getElementById('root'), modelData, translateTable);
}
</script>
My code to get the above response is this:
url = 'https://website.com'
response = urllib.request.urlopen(url)
soup = BeautifulSoup(response.read(), "html.parser")
results = soup.find_all("script", {"type": "text/javascript"})[6]
print(results)
How can I get inside the <script> tag and get the data inside?
I've looked at several other posts on here, but none which I have successfully been able to get working.
You can use this example how to convert the javascript object to JSON:
import re
import json
from bs4 import BeautifulSoup
html_doc = """
<script type="text/javascript">
var modelData = {
name: 'somename',
thumbnailUrl: 'https://website.com/blob/bG9uZ2RhbjovL0ZPVVIvbGRwcm9kLWRlL3ljb3B6YTY4N0pnQ2Nfc3JYcVV3VXc9PQ',
account: '5LH7J44IYPAGEZEYA9KIL',
Id: 'someid'
};
store.initOmlib({"ClusterEndpoints":{"ONE":["http://us.site.me"],"TWO":["http://sg.site.me"],"FOUR":["http://de.site.me"],"FIVE":["http://in.site.me"],"SIX":["http://ja.site.me"],"SEVEN":["http://br.site.me"]},"ClusterEndpointsInternal":{"ONE":["http://usi.site.me"],"TWO":["http://sgi.site.me"],"FOUR":["http://dei.site.me"],"FIVE":["http://ini.site.me"],"SIX":["http://jai.site.me"],"SEVEN":["http://bri.site.me"]},"ClusterKeys":{"FIVE":"Cf0Mw0I2/cZf6alwfMhelEEOb6xq23IhPvC9E4eoaXU=","SIX":"bfYXVkWhs/gv+TCJ3EeeEE3oxiZRDpJO0fecUGdq2Qg=","ONE":"xkkzyNJmZ1DmNPxGwrykZ2O91f10KNXQvspa15nKKGs=","FOUR":"xMRCvh1eki9JEceBcV7Bx49uaQYpX8FdD0eZ+LCGqCc=","TWO":"XaG4I7b7wDOZ+lGHSPwbJ2HLkIFf0UGYAWz9c9LkiQk=","SEVEN":"LuSOGA/u5PL7rP8PG3cr6bqgQy7jXEv65iuHUX9ePQY="},"DefaultCluster":"ONE","IdpEndpoints":["http://idp.site.me"],"IdpKey":"MIOC9PS8KIwXOXSHtplBZLSpIqcifns0jzExtkHXw1g=","ReadOnlyEndpoints":["http://site.gg"],"ReadOnlyKey":"QKxHfdLVgbn+VYpnUiCFLMq/hhUpkpx7occEY3Z0Wnk="}, {"Id":"001026a1c1064a1b9305400814783c2385e2a978f13a","Secret":"0110de13b2187fe3078e13d9f6ad4e7567fdc143e915c9cb4df67ca"});
if (store.renderArc) {
store.renderArc(document.getElementById('root'), modelData, translateTable);
} else {
store.renderUser(document.getElementById('root'), modelData, translateTable);
}
</script>
"""
soup = BeautifulSoup(html_doc, "html.parser")
# Locate the first <script> tag and take its raw text content.
script_text = soup.select_one("script").contents[0]
# Extract the javascript object literal assigned to modelData
# (re.S lets ".*?" span multiple lines, stopping at the first "};").
model_data = re.search(r"modelData = ({.*?});", script_text, flags=re.S)
model_data = model_data.group(1)
# "Convert" the javascript object to valid JSON: replace single quotes with
# double quotes, then quote the bare key at the start of each line (re.M).
# NOTE(review): this textual conversion is brittle — it assumes no value
# contains a single quote and that every key begins its own line.
model_data = re.sub(
r"^\s*([^:\s]+):", r'"\1":', model_data.replace("'", '"'), flags=re.M
)
# Decode the now-valid JSON into a Python dict.
model_data = json.loads(model_data)
# Print the fields of interest.
print(model_data["name"])
print(model_data["thumbnailUrl"])
print(model_data["account"])
Prints:
somename
https://website.com/blob/bG9uZ2RhbjovL0ZPVVIvbGRwcm9kLWRlL3ljb3B6YTY4N0pnQ2Nfc3JYcVV3VXc9PQ
5LH7J44IYPAGEZEYA9KIL
Or: Just parse it with re (html_doc is the same as in previous example):
# Same idea, but pull each value straight out of the raw javascript with one
# regular expression per field (html_doc as in the previous example).
soup = BeautifulSoup(html_doc, "html.parser")
js_source = soup.select_one("script").contents[0]

def _field(key):
    # Grab the single-quoted value that follows "<key>: " in the script body.
    return re.search(key + ": '(.*?)'", js_source).group(1)

name = _field("name")
thumbnailUrl = _field("thumbnailUrl")
account = _field("account")
print(name)
print(thumbnailUrl)
print(account)
Prints:
somename
https://website.com/blob/bG9uZ2RhbjovL0ZPVVIvbGRwcm9kLWRlL3ljb3B6YTY4N0pnQ2Nfc3JYcVV3VXc9PQ
5LH7J44IYPAGEZEYA9KIL

How to get text within <script> tag

I am scraping the LaneBryant website.
Part of the source code is
<script type="application/ld+json">
{
"#context": "http://schema.org/",
"#type": "Product",
"name": "Flip Sequin Teach & Inspire Graphic Tee",
"image": [
"http://lanebryant.scene7.com/is/image/lanebryantProdATG/356861_0000015477",
"http://lanebryant.scene7.com/is/image/lanebryantProdATG/356861_0000015477_Back"
],
"description": "Get inspired with [...]",
"brand": "Lane Bryant",
"sku": "356861",
"offers": {
"#type": "Offer",
"url": "https://www.lanebryant.com/flip-sequin-teach-inspire-graphic-tee/prd-356861",
"priceCurrency": "USD",
"price":"44.95",
"availability": "http://schema.org/InStock",
"itemCondition": "https://schema.org/NewCondition"
}
}
}
}
</script>
In order to get price in USD, I have written this script:
# Asker's original (broken) extraction; indentation was lost in the paste.
def getPrice(self,start):
# Accumulators that are never actually used below.
fprice=[]
discount = ""
# Finds the JSON-LD <script> tag, but keeps it as a Tag object.
price1 = start.find('script', {'type': 'application/ld+json'})
data = ""
#print("price 1 is + "+ str(price1)+"data is "+str(data))
# NOTE(review): stringifying the tag and splitting on commas is brittle —
# the position of the price in the split list depends on the exact markup.
# Parse the tag's text with the json module instead (see the answer below).
price1 = str(price1).split(",")
#price1=str(price1).split(":")
print("final price +"+ str(price1[11]))
where start is :
# Render the page with a real browser (Chrome via selenium), then hand the
# fully rendered HTML to BeautifulSoup for parsing.
d = webdriver.Chrome('/Users/fatima.arshad/Downloads/chromedriver')
d.get(url)
start = BeautifulSoup(d.page_source, 'html.parser')
It doesn't print the price even though I am getting correct text. How do I get just the price?
In this instance you can just regex for the price
import re
import requests

# Fetch the product page (a browser User-Agent avoids being blocked) and
# regex the price straight out of the raw HTML.
response = requests.get(
    'https://www.lanebryant.com/flip-sequin-teach-inspire-graphic-tee/prd-356861#color/0000015477',
    headers={'User-Agent': 'Mozilla/5.0'},
)
price_pattern = re.compile(r'"price":"(.*?)"')
print(price_pattern.findall(response.text)[0])
Otherwise, target the appropriate script tag by id and then parse the .text with json library
import json
import requests
from bs4 import BeautifulSoup

# Fetch the page, then decode the JSON blob embedded in the
# <script id="pdpInitialData"> tag instead of string-splitting the markup.
response = requests.get(
    'https://www.lanebryant.com/flip-sequin-teach-inspire-graphic-tee/prd-356861#color/0000015477',
    headers={'User-Agent': 'Mozilla/5.0'},
)
soup = BeautifulSoup(response.text, 'html.parser')
pdp_data = json.loads(soup.select_one('#pdpInitialData').text)
# Navigate the decoded structure down to the sale price.
price = pdp_data['pdpDetail']['product'][0]['price_range']['sale_price']
print(price)
price1 = start.find('script', {'type': 'application/ld+json'})
This is actually the <script> tag, so a better name would be
script_tag = start.find('script', {'type': 'application/ld+json'})
You can access the text inside the script tag using .text. That will give you the JSON in this case.
json_string = script_tag.text
Instead of splitting by commas, use a JSON parser to avoid misinterpretations:
import json
clothing=json.loads(json_string)

How to scrape all comments from a news site?

I've been trying to scrape some contents of a news-site
such as news description, tags, comments etc. Successfully done with the description and tags. But, while scraping the comments, the tags are not showing after finding by the tags by beautifulsoup, although it is showing if I inspect the page.
I just want to scrape all the comments (nested comments also) in the page and make them a single string to save in a csv file.
import requests
import bs4
from time import sleep
import os

url = 'https://www.prothomalo.com/bangladesh/article/1573772/%E0%A6%AC%E0%A6%BE%E0%A6%82%E0%A6%B2%E0%A6%BE%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A6%BF-%E0%A6%AA%E0%A6%BE%E0%A6%B8%E0%A6%AA%E0%A7%8B%E0%A6%B0%E0%A7%8D%E0%A6%9F%E0%A6%A7%E0%A6%BE%E0%A6%B0%E0%A7%80-%E0%A6%B0%E0%A7%8B%E0%A6%B9%E0%A6%BF%E0%A6%99%E0%A7%8D%E0%A6%97%E0%A6%BE%E0%A6%B0%E0%A6%BE-%E0%A6%B8%E0%A7%8C%E0%A6%A6%E0%A6%BF-%E0%A6%A5%E0%A7%87%E0%A6%95%E0%A7%87-%E0%A6%A2%E0%A6%BE%E0%A6%95%E0%A6%BE%E0%A7%9F'
resource = requests.get(url, timeout=3.0)
soup = bs4.BeautifulSoup(resource.text, 'lxml')

# Tags: join the text of every <a> under the topic list, '|'-separated
# (a trailing '|' is kept, matching the original accumulation loop).
tags_holder = soup.find('div', {'class': 'topic_list'})
tag = ''.join(a.text + '|' for a in tags_holder.findAll('a', {'': ''}))

# Article body: concatenate the text of every <p> inside the article body.
content_tag = soup.find('div', {'itemprop': 'articleBody'})
content = ''.join(p.text for p in content_tag.findAll('p', {'': ''}))

# Comments: this container is empty in the fetched HTML because the comments
# are loaded client-side via AJAX, so requests never sees them.
comment = soup.find('div', {'class': 'comments_holder'})
print(comment)
console:
<div class="comments_holder">
<div class="comments_holder_inner">
<div class="comments_loader"> </div>
<ul class="comments_holder_ul latest">
</ul>
</div>
</div>
What you see in Firefox/Developer tools is not what you received through requests. The comments are loading separately through AJAX and they are in JSON format.
import re
import json
import requests
from bs4 import BeautifulSoup

# The comments are not in the page HTML; they come from a JSON endpoint,
# keyed by the numeric article id that appears in the article URL.
url = 'https://www.prothomalo.com/bangladesh/article/1573772/%E0%A6%AC%E0%A6%BE%E0%A6%82%E0%A6%B2%E0%A6%BE%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A6%BF-%E0%A6%AA%E0%A6%BE%E0%A6%B8%E0%A6%AA%E0%A7%8B%E0%A6%B0%E0%A7%8D%E0%A6%9F%E0%A6%A7%E0%A6%BE%E0%A6%B0%E0%A7%80-%E0%A6%B0%E0%A7%8B%E0%A6%B9%E0%A6%BF%E0%A6%99%E0%A7%8D%E0%A6%97%E0%A6%BE%E0%A6%B0%E0%A6%BE-%E0%A6%B8%E0%A7%8C%E0%A6%A6%E0%A6%BF-%E0%A6%A5%E0%A7%87%E0%A6%95%E0%A7%87-%E0%A6%A2%E0%A6%BE%E0%A6%95%E0%A6%BE%E0%A7%9F'
comment_url = 'https://www.prothomalo.com/api/comments/get_comments_json/?content_id={}'

# Pull the article id out of the URL and request that article's comments.
article_id = re.search(r'article/(\d+)', url).group(1)
response = requests.get(comment_url.format(article_id))
comment_data = response.json()
print(json.dumps(comment_data, indent=4))
Prints:
{
"5529951": {
"comment_id": "5529951",
"parent": "0",
"label_depth": "0",
"commenter_name": "MD Asif Iqbal",
"commenter_image": "//profiles.prothomalo.com/profile/999009/picture/",
"comment": "\u098f\u0987 \u09ad\u09be\u09b0 \u09ac\u09be\u0982\u09b2\u09be\u09a6\u09c7\u09b6\u0995\u09c7 \u09b8\u09be\u09b0\u09be\u099c\u09c0\u09ac\u09a8 \u09ac\u09b9\u09a8 \u0995\u09b0\u09a4\u09c7 \u09b9\u09ac\u09c7",
"create_time": "2019-01-08 19:59",
"comment_status": "published",
"like_count": "\u09e6",
"dislike_count": "\u09e6",
"like_me": null,
"dislike_me": null,
"device": "phone",
"content_id": "1573772"
},
"5529952": {
"comment_id": "5529952",
"parent": "0",
... and so on.

Paginate through json data with Flask

I am using Flask and generate tables that are filled with the JSON data that I retrieve. The problem that I have now is that I need to paginate through all the JSON data, because the maximum per page is set at '50' and I want to show all the products in my table.
So far I can't get this working and I don't really know how to get it working with Flask. I tried using a while loop, but that doesn't work with Jinja2 because that command is not recognized.
This is my Python code:
@app.route('/products', methods=['POST', 'GET'])
def products():
    """Log in to the shop's admin API and render its products in a table.

    Reads the shop name and credentials from the submitted form, authenticates
    against the shop, fetches the first page of products.json and passes the
    decoded JSON to the template. NOTE(review): only the first page (max 50
    products) is fetched here.
    """
    shopnaam = request.form['shopname']
    username = request.form['username']
    password = request.form['password']
    # Bug fix: the login URL was missing the '.' before the domain
    # ('https://' + shopnaam + 'example.com'), so the POST targeted the
    # wrong host — compare the products URL below.
    login = 'https://' + shopnaam + '.example.com'
    url = 'https://' + shopnaam + '.example.com/admin/products.json'
    payload = {
        'login[email]': username,
        'login[password]': password,
    }
    with requests.Session() as session:
        # Authenticate first so the session cookie is sent with the data request.
        session.post(login, data=payload)
        r = session.get(url)
    parsed = json.loads(r.text)
    return render_template('producten.html', parsed=parsed)
This is my Jinja2 code:
<button class="collapsible">Bekijk product Informatie</button>
<div class="content">
<table id = "productentabel">
<tr class = "header">
<th>ID</th>
<th>Titel </th>
<th>Prijs Exclusief BTW</th>
<th>Prijs Inclusief BTW</th>
<th>Datum</th>
{% for product in parsed['products'] %}
<TR>
<TD width="100px" >{{product['id']}}</TD>
<TD width="300px" >{{product['nl']['title']}}</TD>
<TD width="150px">{{product['price_excl']}}</TD>
<TD width="150px">{{product['price_incl']}}</TD>
<TD width="300px">{{product['created_at']}}</TD>
</TR>
</tr>
{% endfor %}
</table>
<input class = "exportknop" value="Exporteer product informatie" type="button" onclick="$('#productentabel').table2CSV({header:['ID','Titel','Prijs Exclusief BTW', 'Prijs Inclusief BTW', 'Datum']})">
</div>
As you can see I am using a for loop, this code works, but the pagination is the issue.
My JSON looks like this:
products: [
{
article_code: "123",
barcode: "456",
brand_id: 2600822,
created_at: "2018-05-31T15:15:34+02:00",
data01: "",
data02: "",
data03: "",
delivery_date_id: null,
has_custom_fields: false,
has_discounts: false,
has_matrix: false,
hits: 0,
hs_code: null,
id: 72660113,
image_id: null,
is_visible: false,
price_excl: 33.0165,
price_incl: 39.95,
price_old_excl: 0,
price_old_incl: 0,
product_set_id: null,
product_type_id: null,
search_context: "123 456 789",
shop_id: 252449,
sku: "789",
supplier_id: 555236,
updated_at: "2018-05-31T15:15:34+02:00",
variants_count: 1,
visibility: "hidden",
weight: 0,
links: {
first: ".json",
last: ".json?page=70",
prev: null,
next: ".json?page=2",
count: 3497,
limit: 50,
pages: 70
}
So links is where the pagination happens, I tried the following in my Python code and with this I get all the values printed in my python terminal. Only I can't send the data to the tables.
# Log in once, then follow the API's 'links.next' cursor page by page.
# Bug fixes vs. the original: (1) the session/login no longer happens on
# every iteration; (2) 'next' is null (None) on the last page, which made
# the original string concatenation raise TypeError — we now stop cleanly.
with requests.Session() as session:
    session.post(login, data=payload)
    while url:
        parsed = json.loads(session.get(url).text)
        for product in parsed['products']:
            print(product['id'], product['nl']['title'])
        # next_page looks like ".json?page=2", or None on the last page.
        next_page = parsed['links']['next']
        url = ('https://example/admin/products' + next_page) if next_page else None
So a couple things to consider here. First, using a generator to create web content is going to be next to impossible outside of a websocket or async calls. The reason is that WSGI needs all the data to come before it renders. Then it closes the connection. You can yield data, but in a table, this is going to cause problems in raw html.
What to do:
I would use something like datatables (datatables.net) feed your table data into a variable, and let datatables handle the pagination for you.
Example using Bootstrap 4 and DataTables.
<script>
// Row data for the table; filled in by the AJAX call in generate().
var dataSet = []
// API endpoint that returns the rows; assigned in $(document).ready below.
var wurl = ""
// Build an empty DataTable first so the page renders immediately;
// DataTables itself provides filtering, sorting and pagination.
tablex = $('#tablex').dataTable( {
select: true,
stateSave: true,
colReorder: true,
deferRender: true,
"oLanguage": {"sSearch": "Filter:"},
columns: [
{ title: "Number" },
{ title: "Client Name" },
{ title: "Opened" },
{ title: "Closed" },
{ title: "Worked" }
]
} );
// Fetch the data from the API and rebuild the table with it.
function generate() {
$.get(wurl, function( data ) {
dataSet = data;
// fnDestroy tears down the previous DataTable instance so it can be
// re-initialised with the freshly fetched dataSet.
tablex.fnDestroy();
tablex = $('#tablex').dataTable( {
data: dataSet,
select: true,
stateSave: true,
colReorder: true,
deferRender: true,
"oLanguage": {"sSearch": "Filter:"},
columns: [
{ title: "Number" },
{ title: "Client Name" },
{ title: "Opened"},
{ title: "Closed"},
{ title: "Worked"}
],
} );
});
};
// On page load, point at the API route and populate the table once.
$(document).ready(function() {
wurl = '/api/getdata';
generate()
} );
</script>
I first establish just a base table, and then I call an API and load that table with data in the background. You could even have this poll and refresh the dataset on an interval if you wanted. The API just delivers the raw data in a column output.
Your table id is "tablex" in this example.
Your API route can be anything, just a return value of your output that is in column and row format.

Find next items to tag in Beautiful Soup

I want to parse a HTML file with Beautiful Soup and Python like
<h1>Title 1</h1>
<div class="item"><p>content 1</p></div>
<div class="item"><p>content 2</p></div>
...
<h1>Title 2</h1>
<div class="item"><p>content 3</p></div>
<div class="item"><p>content 4</p></div>
<div class="item"><p>content 5</p></div>
...
How can I parse this into a dictionary like
{
"Title 1": [
{
"content": "content 1"
},
{
"content": "content 2"
}
],
"Title 2": [
{
"content": "content 3"
},
{
"content": "content 4"
},
{
"content": "content 5"
}
]
}
I've tried it with nextSibling, but I am not able check the tag name.
Here is how you can achieve this:
# Walk the top-level nodes of the parsed document in order, grouping the
# <div> contents under the most recently seen <h1> title.
# (Indentation reconstructed; a leftover debugging print(tag) was removed.)
soup = bs4.BeautifulSoup(html)
data = {}     # title -> list of div contents
row = []      # contents collected under the current title
title = ""    # current <h1> text ("" until the first one is seen)
for tag in soup:
    if tag.name == 'h1':
        # A new section starts: store the rows gathered for the previous one.
        if title:
            data[title] = row
        row = []
        title = tag.string
    elif tag.name == 'div':
        row.append(tag.string)
# Flush the rows of the final section.
if title:
    data[title] = row
The idea is to iterate over the tags.
If the current tag is a <h1>, then create a new content list.
Else, if it is a <div> tag, then append its content to the current content list.
When a new <h1> tag is found, put the current content list into the global data structure (that is a dictionary), under the name of the last title.
The type of a tag can be found in tag.name.
This is what you need to check so as to find if a tag is a <h1> or a <div>.
This gives a little bit different structure than what you asked for, but I think it is a better data structure, for the keys in your dictionary are always content, so basically no key is needed, and a list is better.
Test input:
html = """<h1>Title 1</h1>
<div class="item"><p>content 1</p></div>
<div class="item"><p>content 2</p></div>
<h1>Title 2</h1>
<div class="item"><p>content 3</p></div>
<div class="item"><p>content 4</p></div>
<div class="item"><p>content 5</p></div>
"""
Output:
{'Title 1': ['content 1', 'content 2'], 'Title 2': ['content 3', 'content 4', 'content 5']}
You can do this with next_sibling and check tag type by .name:
soup = BeautifulSoup(html_page, 'html.parser')

# Start from the first <h1> and walk its following siblings one by one,
# appending each <div>'s content under the last seen title.
temp_tag = soup.h1
temp_key = temp_tag.text
result = {temp_key: []}
while True:
    temp_tag = temp_tag.next_sibling
    if temp_tag is None:
        # Bug fix: at the end of the document next_sibling is None, which
        # previously crashed with AttributeError when reading .name.
        break
    if temp_tag.name == 'div':
        # <div class="item"><p>content</p></div> -> take the inner text.
        buf = temp_tag.contents[0].text
        result[temp_key].append({'content': buf})
    elif temp_tag.name == 'h1':
        # New section: start a fresh list under the new title.
        temp_key = temp_tag.text
        result[temp_key] = []
    else:
        # NOTE(review): any other sibling (including whitespace text nodes,
        # whose .name is None) stops the walk — this assumes there is no
        # text between tags; confirm against the real input markup.
        break
print(result)
The output of this code:
{
u'Title 1': [
{'content': u'content 1'},
{'content': u'content 2'}
],
u'Title 2': [
{'content': u'content 3'},
{'content': u'content 4'},
{'content': u'content 5'}
]
}

Categories