So far I've managed to make this:
from bs4 import BeautifulSoup
import requests
def function():
    url = 'https://dynasty-scans.com/chapters/liar_satsuki_can_see_death_ch28_6#6'
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    script = soup.find_all('script')
    print(script[1])
output:
<script>
//<![CDATA[
var pages = [{"image":"/system/releases/000/036/945/1.png","name":"1"},{"image":"/system/releases/000/036/945/2.png","name":"2"},{"image":"/system/releases/000/036/945/3.png","name":"3"},{"image":"/system/releases/000/036/945/4.png","name":"4"},{"image":"/system/releases/000/036/945/5.png","name":"5"},{"image":"/system/releases/000/036/945/6.png","name":"6"},{"image":"/system/releases/000/036/945/7.png","name":"7"},{"image":"/system/releases/000/036/945/credits.png","name":"credits"}];
//]]>
</script>
I'm trying to extract the values of "image" as strings,
for example: "/system/releases/000/036/945/7.png"
How can I do it?
You can use a regular expression to extract the variable "pages":
import re, json, requests
url = 'https://dynasty-scans.com/chapters/liar_satsuki_can_see_death_ch28_6#6'
r = requests.get(url)
# extract the data
match = re.search(r'var pages = (\[.*?\]);', r.text).group(1)
# parse it into json
match_json = json.loads(match)
# iterate through it to get the links
images = [img['image'] for img in match_json]
output:
['/system/releases/000/036/945/1.png',
'/system/releases/000/036/945/2.png',
'/system/releases/000/036/945/3.png',
'/system/releases/000/036/945/4.png',
'/system/releases/000/036/945/5.png',
'/system/releases/000/036/945/6.png',
'/system/releases/000/036/945/7.png',
'/system/releases/000/036/945/credits.png']
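The paths are relative, so if you need absolute URLs you can join each one with the site root (a small sketch; it assumes the images are served from dynasty-scans.com itself):
from urllib.parse import urljoin

full_urls = [urljoin('https://dynasty-scans.com', img) for img in images]
# e.g. 'https://dynasty-scans.com/system/releases/000/036/945/1.png'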
I am trying to fetch a URL from a webpage. In the browser's Inspect panel the link shows up as a relative path, and it comes through the same way in my Python code (with a leading ../../ part).
How can I get the actual URL without the ../../ part using BeautifulSoup?
Here is my code in case it's needed:
import re
import requests
from bs4 import BeautifulSoup
source = requests.get('https://books.toscrape.com/catalogue/category/books_1/index.html').text
soup = BeautifulSoup(source, 'lxml')
# article = soup.find('article')
# title = article.div.a.img['alt']
# print(title['alt'])
titles, topics, urls, sources = [], [], [], []
article_productPod = soup.findAll('article', {"class": "product_pod"})
for i in article_productPod:
    titles.append(i.div.a.img['alt'])
# print(titles)
for q in article_productPod:
    urls.append(q.h3.a['href'])
print(urls[0])
# for z in range(len(urls)):
# source2 = requests.get("https://" + urls[z])
Use urllib:
import urllib.parse
Store your target URL in a separate variable:
src_url = r'https://books.toscrape.com/catalogue/category/books_1/index.html'
source = requests.get(src_url).text
Join the website's URL and the relative URL:
for q in article_productPod:
    urls.append(urllib.parse.urljoin(src_url, q.h3.a['href']))
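Putting it together, a minimal sketch of the fix applied to the original code:
import urllib.parse
import requests
from bs4 import BeautifulSoup

src_url = 'https://books.toscrape.com/catalogue/category/books_1/index.html'
soup = BeautifulSoup(requests.get(src_url).text, 'lxml')

urls = []
for q in soup.findAll('article', {'class': 'product_pod'}):
    # urljoin resolves the relative ../../ href against the page URL
    urls.append(urllib.parse.urljoin(src_url, q.h3.a['href']))

print(urls[0])
Each entry in urls is now an absolute https://books.toscrape.com/catalogue/... address that requests.get() can fetch directly.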
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
url = "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
data = requests.get(url)
soup = bs(data.content,"html.parser")
The code below is a test to get one item.
property_overview = soup.find(class_="p24_regularListing").find(class_="p24_propertyOverview").find(class_='p24_propertyOverviewRow').find(class_='col-xs-6 p24_propertyOverviewKey').text
property_overview
Output: 'Listing Number'
The code below is what we have to get all of the col-xs-6 p24_propertyOverviewKey elements:
p24_regularListing_items = soup.find_all(class_="p24_regularListing")
for p24_propertyOverview_item in p24_regularListing_items:
    p24_propertyOverview_items = p24_propertyOverview_item.find_all(class_="p24_propertyOverview")
    for p24_propertyOverviewRow_item in p24_propertyOverview_items:
        p24_propertyOverviewRow_items = p24_propertyOverviewRow_item.find_all(class_="p24_propertyOverviewRow")
        for p24_propertyOverviewKey_item in p24_propertyOverviewRow_items:
            p24_propertyOverviewKey_items = p24_propertyOverviewKey_item.find_all(class_="col-xs-6 p24_propertyOverviewKey")
p24_propertyOverviewKey_items
The code above only outputs one item, not all of them.
To put things more simply, you can use soup.select() (and via the comments, you can then use .get_text() to extract the text from each tag).
from bs4 import BeautifulSoup
import requests
resp = requests.get(
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")
texts = []
for tag in soup.select(
    # NB: this selector uses Python's implicit string concatenation
    # to split it onto several lines.
    ".p24_regularListing "
    ".p24_propertyOverview "
    ".p24_propertyOverviewRow "
    ".p24_propertyOverviewKey"
):
    texts.append(tag.get_text())
print(texts)
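The same thing reads well as a list comprehension; get_text(strip=True) also trims the surrounding whitespace:
texts = [
    tag.get_text(strip=True)
    for tag in soup.select(
        ".p24_regularListing .p24_propertyOverview "
        ".p24_propertyOverviewRow .p24_propertyOverviewKey"
    )
]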
I'm trying to scrape some data with Python and BeautifulSoup. I know how to get the text from the script tag. The data between [ ] is valid JSON.
<script>
dataLayer =
[
    {
        "p": {
            "t": "text1",
            "lng": "text2",
            "vurl": "text3"
        },
        "c": { },
        "u": { },
        "d": { },
        "a": { }
    }
]
</script>
I've read this response and it almost does what I want:
Extract content of <Script with BeautifulSoup
Here is my code:
import urllib.request
from bs4 import BeautifulSoup
import json
url = "www.example.com"
html = urllib.request.urlopen(url)
soup = BeautifulSoup(html, "html.parser")
raw_data = soup.find("script")
I would then ideally do:
json_dict = json.loads(raw_data)
And access the data through the dictionary. But this is not working, because the valid JSON is preceded by "<script> dataLayer =" and followed by the closing script tag. I've tried trimming raw_data as a string, like this:
raw_data[20:]
But this didn't work because the soup object is not a string.
How can I get the raw_data variable to contain ONLY the text between the block quotes [ ]?
EDIT: this seems to work. It avoids regex and solves the problem of the trailing chars as well. Thanks for your suggestions.
url = "www.example.com"
html = urllib.request.urlopen(url)
soup = BeautifulSoup(html, "html.parser")
# get the script tag data and convert soup into a string
data = str(soup.find("script"))
# cut the <script> tag and some other things from the beginning and end to get valid JSON
cut = data[27:-13]
# load the data as a json dictionary
jsoned = json.loads(cut)
Use .text to get the content inside the <script> tag, then strip off the leading dataLayer =:
raw_data = soup.find("script")
raw_data = raw_data.text.replace('dataLayer =', '')
json_dict = json.loads(raw_data)
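Since json.loads ignores leading and trailing whitespace, splitting on the first = works as well (a sketch, assuming dataLayer = is the only assignment inside the tag):
raw_data = soup.find("script")
json_dict = json.loads(raw_data.text.split("=", 1)[1])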
>>> import re
>>> re.search(r"\[.*\]", soup.find("script").text, re.DOTALL).group()
You would do that with a regex: build a pattern that captures only the text between the square brackets, then pass the matched string to json.loads.
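A sketch putting the regex together with json.loads (it assumes the dataLayer script is the first <script> tag on the page, as in the question):
import re, json

raw = soup.find("script").text
data = json.loads(re.search(r"\[.*\]", raw, re.DOTALL).group())
print(data[0]["p"]["t"])  # 'text1'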
I need to create a script that extracts a word from a set of scraped images.
To explain: starting from a sitemap.xml page, my code must visit every link in that XML file and check whether a specific word appears inside an image link on the page.
The sitemap is the adidas one: http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml
This is the code I created to find the images whose URL contains the word "zoom":
import requests
from bs4 import BeautifulSoup
html = requests.get('http://www.adidas.it/scarpe-superstar/C77124.html').text
bs = BeautifulSoup(html, 'html.parser')
possible_links = bs.find_all('img')
for link in possible_links:
    if link.has_attr('src'):
        if 'zoom' in link['src']:
            print(link['src'])
but I'm looking for a way to scrape a list of pages automatically.
Thank you so much. I tried this to get the list:
from bs4 import BeautifulSoup
import requests
url = "http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, 'lxml')
for url in soup.findAll("loc"):
    print(url.text)
but I can't combine it with the requests, so that I can search for the word "zoom" in every link present in the sitemap.xml.
Thank you so much.
import requests
from bs4 import BeautifulSoup
import re
def make_soup(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup

# put the urls in a list
def get_xml_urls(soup):
    urls = [loc.string for loc in soup.find_all('loc')]
    return urls

# get the img urls
def get_src_contain_str(soup, string):
    srcs = [img['src'] for img in soup.find_all('img', src=re.compile(string))]
    return srcs

if __name__ == '__main__':
    xml = 'http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml'
    soup = make_soup(xml)
    urls = get_xml_urls(soup)
    # loop through the urls
    for url in urls:
        url_soup = make_soup(url)
        srcs = get_src_contain_str(url_soup, 'zoom')
        print(srcs)
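If you only want to see the pages that actually contain a matching image, you could filter on the result (a small, hypothetical refinement of the loop above):
for url in urls:
    srcs = get_src_contain_str(make_soup(url), 'zoom')
    if srcs:  # skip pages without any matching image
        print(url, srcs)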
So I have a function that is called when I click a button; it goes as below:
var min_news_id = "68feb985-1d08-4f5d-8855-cb35ae6c3e93-1";
function loadMoreNews(){
    $("#load-more-btn").hide();
    $("#load-more-gif").show();
    $.post("/en/ajax/more_news",{'category':'','news_offset':min_news_id},function(data){
        data = JSON.parse(data);
        min_news_id = data.min_news_id||min_news_id;
        $(".card-stack").append(data.html);
    })
    .fail(function(){alert("Error : unable to load more news");})
    .always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
Now I don't have much experience with JavaScript, but I assume it's returning some JSON data from some sort of API at "/en/ajax/more_news".
Is there a way I could directly call this API and get the JSON data from my Python script? If yes, how?
If not, how do I scrape the content that is being generated?
You need to post the news id that you see inside the script to https://www.inshorts.com/en/ajax/more_news; this is an example using requests:
from bs4 import BeautifulSoup
import requests
import re
# pattern to extract min_news_id
patt = re.compile(r'var min_news_id\s+=\s+"(.*?)"')

with requests.Session() as s:
    soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content, "html.parser")
    new_id_scr = soup.find("script", text=re.compile(r"var\s+min_news_id"))
    print(new_id_scr.text)
    news_id = patt.search(new_id_scr.text).group(1)  # group(1) is just the id itself
    js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset": news_id})
    print(js.json())
The response JSON contains all the html; you just have to access js.json()["html"].
Here is a script that will automatically loop through all the pages on inshorts.com:
from bs4 import BeautifulSoup
from newspaper import Article
import requests
import sys
import re
import json
patt = re.compile(r'var min_news_id\s+=\s+"(.*?)"')
i = 0

while True:
    with requests.Session() as s:
        if i == 0:
            # on the first pass, read the id embedded in the page's script tag
            soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content, "lxml")
            new_id_scr = soup.find("script", text=re.compile(r"var\s+min_news_id"))
            news_id = patt.search(new_id_scr.text).group(1)
        js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset": news_id})
        jsonToPython = js.json()  # the response is already JSON, no dumps/loads round-trip needed
        news_id = jsonToPython["min_news_id"]
        data = jsonToPython["html"]
    i += 1
    soup = BeautifulSoup(data, "lxml")
    for tag in soup.find_all("div", {"class": "news-card"}):
        main_text = tag.find("div", {"itemprop": "articleBody"})
        summ_text = main_text.text.replace("\n", " ")
        result = tag.find("a", {"class": "source"})
        art_url = result.get('href')
        if 'www.youtube.com' in art_url:
            print("Nothing")
        else:
            art_url = art_url[:-1]
            # print("Hello", art_url)
            article = Article(art_url)
            article.download()
            if article.is_downloaded:
                article.parse()
                article_text = article.text.replace("\n", " ")
                print(article_text + "\n")
                print(summ_text + "\n")
It gives both the summary from inshorts.com and the complete story from the respective news outlet.