I am trying to scrape the App Store pages of games to fetch their icon and screenshots using Python. However, it sometimes returns random images that are not actually game screenshots; for example, it might return an Apple Maps screenshot.
Here is my script to get the image URLs:
import requests
from bs4 import BeautifulSoup

def get_app_store_images_icon(url):
    response = requests.get(url)
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    link = soup.find('ul', {"class": "l-row l-row--peek we-screenshot-viewer__screenshots-list"})
    link_icon = soup.find('div', {"class": "product-hero__media l-column small-5 medium-4 large-3 small-valign-top"})
    images = []
    try:
        for subHtml in link.find_all('picture'):
            image_sources = subHtml.find_all('source')[-1].get('srcset')
            # to get the last image URL in the srcset
            start = image_sources.rindex(", ")
            end = image_sources.rindex(" ")
            images.append(image_sources[start + 2:end])
    except (AttributeError, IndexError, ValueError):
        return None
    try:
        image_icon_sources = link_icon.find_all('source')[-1].get('srcset')
        start = image_icon_sources.rindex(", ")
        end = image_icon_sources.rindex(" ")
        image_icon = image_icon_sources[start + 2:end]
    except (AttributeError, IndexError, ValueError):
        return None
    all_images = {'images': images, 'icon': image_icon}
    return all_images
I am using BeautifulSoup4 version 4.9.1 and Python 3.7.
I want to know whether this is a bug in my code or whether Apple dynamically changes the images served to crawlers as an anti-scraping measure.
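For what it's worth, here is a small sketch of the srcset handling itself (the same idea as the rindex slicing above, just written as a helper; it does not change which images Apple serves):

def last_srcset_url(srcset):
    # srcset looks like "img-300.png 300w, img-600.png 600w";
    # take the last entry and keep only its URL part
    last_entry = srcset.split(",")[-1].strip()
    return last_entry.split(" ")[0]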
I am trying to get the images that are placed inside each <li>'s anchor tag as an href.
I am able to get only one link, not all of them.
I am trying to scrape the following page:
https://www.msxdistribution.com/love-triangle
As you can see, there are multiple product images and I am trying to get all of them. Unfortunately, I have only managed to get the first image, not the others.
Here's my code:
import requests
from bs4 import BeautifulSoup
from lxml import etree

def scraping_data(productlinks, r):
    ix = 0
    for link in productlinks:
        ix = ix + 1
        f = requests.get(link, headers=headers).text
        hun = BeautifulSoup(f, 'html.parser')
        dom = etree.HTML(str(hun))
        # Here I get the description of the product
        try:
            name = hun.find("h1", {"class": "product-name"}).get_text().replace('\n', "")
            print(name)
        except:
            name = None
        try:
            print("Trying to fetch image...")
            all_imgs = hun.find_all('img')  # Here I tried to fetch every img from the website
            for image in all_imgs:
                print(all_imgs)
                ioner = image.find_all(attrs={'class': 'zoomImg'})  # Tried to get only images with class zoomImg - unsuccessful
                print(ioner)
            ss = hun.find("a", {"class": "fancy-images"}).get('href')  # This one gets only the first img, and it works
            print(ss)
        except Exception as e:
            print("No images")
Try:
import requests
from bs4 import BeautifulSoup

url = "https://www.msxdistribution.com/love-triangle"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

for img in soup.select(".etalage_thumb_image"):
    print(img["src"])
Prints:
https://www.msxdistribution.com/media/catalog/product/cache/4/thumbnail/800x800/9df78eab33525d08d6e5fb8d27136e95/7/1/7101024-1_1/stimolatore-love-triangle-11.jpg
https://www.msxdistribution.com/media/catalog/product/cache/4/thumbnail/800x800/9df78eab33525d08d6e5fb8d27136e95/7/1/7101024-2_1/stimolatore-love-triangle-12.jpg
https://www.msxdistribution.com/media/catalog/product/cache/4/thumbnail/800x800/9df78eab33525d08d6e5fb8d27136e95/7/1/7101024-3_1/stimolatore-love-triangle-13.jpg
https://www.msxdistribution.com/media/catalog/product/cache/4/thumbnail/800x800/9df78eab33525d08d6e5fb8d27136e95/7/1/7101024-4_1/stimolatore-love-triangle-14.jpg
https://www.msxdistribution.com/media/catalog/product/cache/4/thumbnail/800x800/9df78eab33525d08d6e5fb8d27136e95/7/1/7101024-5/stimolatore-love-triangle-15.jpg
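If you also want the full-size images behind the fancy-images anchors (building on the hun.find("a", {"class": "fancy-images"}) call from the question, which only returns the first match), a sketch of the same idea applied to every anchor:

for a in soup.select("a.fancy-images"):
    print(a.get("href"))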
I have a project where I'm scraping data from Trulia.com, and I want to get the maximum page number (the last number in the pagination) for a specific location so I can loop through the pages and get all the hrefs.
To get that last number, I have code that runs as planned and should return an integer, but it doesn't always return the same number. I added the print of the comprehension list to understand what's wrong. Here are the code and the output below. The return is commented out, but it should return the last number of the output list as an int.
import requests as r
from bs4 import BeautifulSoup as bs

city_link = "https://www.trulia.com/for_rent/San_Francisco,CA/"

def bsoup(url):
    resp = r.get(url, headers=req_headers)
    soup = bs(resp.content, 'html.parser')
    return soup

def max_page(link):
    soup = bsoup(link)
    page_num = soup.find_all(attrs={"data-testid": "pagination-page-link"})
    print([x.get_text() for x in page_num])
    # return int(page_num[-1].get_text())

for x in range(10):
    max_page(city_link)
I have no clue why it sometimes returns the wrong result.
Okay, if I understand correctly, you are trying to find out how many pages of listings there are for a given rental location. Assuming the given link is the only one required, this code:
import requests
import bs4

url = "https://www.trulia.com/for_rent/San_Francisco,CA/"
req = requests.get(url)
soup = bs4.BeautifulSoup(req.content, features='lxml')

def get_number_of_pages(soup):
    caption_tag = soup.find('div', class_="Text__TextBase-sc-1cait9d-0 div Text__TextContainerBase-sc-1cait9d-1 RBSGf")
    pagination = caption_tag.text
    words = pagination.split(" ")
    values = []
    for word in words:
        if not word.isalpha():
            values.append(word)
    links_per_page = values[0].split('-')[1]
    total_links = values[1].replace(',', '')
    # round up without importing math
    no_of_pages = round(int(total_links) / int(links_per_page) + 0.5)
    return no_of_pages

for i in range(10):
    print(get_number_of_pages(soup))
achieves what you're looking for, and it is repeatable because it doesn't interact with JavaScript; it reads the pagination caption at the bottom of the page instead.
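As a possible alternative sketch (using the question's own data-testid="pagination-page-link" selector, and assuming the last element of that list can be a "Next" arrow rather than a page number), you could keep only the numeric page links and take their maximum:

def max_page_number(soup):
    # keep only the links whose text is a number, then take the largest one
    page_links = soup.find_all(attrs={"data-testid": "pagination-page-link"})
    numbers = [int(x.get_text()) for x in page_links if x.get_text().strip().isdigit()]
    return max(numbers) if numbers else 1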
I'm doing some simple web scraping and need a better way to loop through pagination on the target site. The only way I could get it to work was to write 10+ nested "for loops".
Basically, I look for a "Next" icon on the page; if it exists, I grab the parent link of the icon image, append its href to the base URL, go to the new URL, search for the same icon, and repeat until I reach the last page (where the icon is gone).
How can I do this without hard-coding a bunch of for loops?
import requests
from bs4 import BeautifulSoup

url = "http://www.somewebsite.com/"
r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")

for img in soup.findAll("img"):
    if "/Next_Icon" in img["src"]:
        link = img.find_parent("a", href=True)
        extLink = link["href"]
        url = "http://www.somewebsite.com/" + extLink
url_stack = ["http://www.somewebsite.com/"]

while url_stack:
    wurl = url_stack.pop()
    r = requests.get(wurl)
    soup = BeautifulSoup(r.text, "lxml")
    for img in soup.findAll("img"):
        if "/Next_Icon" in img["src"]:
            link = img.find_parent("a", href=True)
            extLink = link["href"]
            url = "http://www.somewebsite.com/" + extLink
            url_stack.append(url)
You should use a list to store all the URLs still to be visited.
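One detail worth adding as an assumption (the real site isn't shown): if the "Next" link's href can be absolute as well as relative, urllib.parse.urljoin handles both cases more safely than string concatenation:

from urllib.parse import urljoin

base = "http://www.somewebsite.com/"
# urljoin keeps absolute hrefs as-is and resolves relative ones against the base
url_stack.append(urljoin(base, extLink))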
I am looking to identify the URLs that request external resources in HTML files.
I currently use the src attribute on the img and script tags, and the href attribute on the link tag (to identify CSS).
Are there other tags I should be examining to identify other resources?
For reference, my code in Python is currently:
from bs4 import BeautifulSoup

html = read_in_file(file)
soup = BeautifulSoup(html)

image_src = [x['src'] for x in soup.findAll('img')]
css_link = [x['href'] for x in soup.findAll('link')]

script_src = []  # script tags often lack a 'src' attribute, hence the try/except
for x in soup.findAll('script'):
    try:
        script_src.append(x['src'])
    except KeyError:
        pass
I updated my code to capture what seem to be the most common resource-bearing tags in HTML. Obviously this doesn't look at resources requested from within CSS or JavaScript. If I am missing tags, please comment.
from bs4 import BeautifulSoup

def find_list_resources(tag, attribute, soup):
    resources = []
    for x in soup.findAll(tag):
        try:
            resources.append(x[attribute])
        except KeyError:
            pass
    return resources

html = read_in_file(file)
soup = BeautifulSoup(html)

image_src = find_list_resources('img', "src", soup)
script_src = find_list_resources('script', "src", soup)
css_link = find_list_resources("link", "href", soup)
video_src = find_list_resources("video", "src", soup)
audio_src = find_list_resources("audio", "src", soup)
iframe_src = find_list_resources("iframe", "src", soup)
embed_src = find_list_resources("embed", "src", soup)
object_data = find_list_resources("object", "data", soup)
source_src = find_list_resources("source", "src", soup)
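One more source of image URLs that may be worth covering (a sketch, not part of the original list): img and source tags can also reference images through the srcset attribute, which holds a comma-separated list of URL/descriptor pairs rather than a single URL:

def find_srcset_resources(soup):
    # srcset entries look like "image-small.jpg 480w, image-large.jpg 1024w";
    # the URL is the first token of each comma-separated entry
    urls = []
    for tag in soup.findAll(['img', 'source']):
        srcset = tag.get('srcset')
        if srcset:
            urls.extend(entry.strip().split(' ')[0] for entry in srcset.split(','))
    return urls

srcset_src = find_srcset_resources(soup)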
I am screen-scraping data using a web crawler and storing the results (tweets from a Twitter page) as separate HTML files for each user I'm crawling. I intend to later parse the HTML files and store the data in a database for analysis. However, I am having a bizarre problem.
When I run the following program - a small snippet from the overall crawler - I am able to get a separate html file for each follower:
import re
import urllib2
import twitter

start_follower = "NYTimesKrugman"
depth = 3
searched = set()

api = twitter.Api()

def crawl(follower, in_depth):
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            crawl(name, in_depth - 1)

crawl(start_follower, depth)

for x in searched:
    print x
print "Program is completed."
However, when I run the full crawler, I do not get a separate file for each follower:
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time

start_follower = "NYTimeskrugman"
depth = 2
searched = set()

api = twitter.Api()

def add_to_U(user):
    U.append(user)

def site(follower):  # creates a twitter site url in string format based on the follower username
    followersite = "http://mobile.twitter.com/" + follower
    return followersite

def getPage(follower):  # obtains access to a webpage
    url = site(follower)
    response = urllib.urlopen(url)
    return response

def getSoup(response):  # creates the parsing module
    html = response.read()
    soup = BeautifulSoup(html)
    return soup

def gettweets(soup, output):
    tags = soup.findAll('div', {'class': "list-tweet"})  # to obtain the tweets of a follower
    for tag in tags:
        a = tag.renderContents()
        b = str(a)
        output.write(b)
        output.write('\n\n')

def are_more_tweets(soup):  # to check whether there is more than one page on mobile twitter
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        test_b = str(b)
        if test_b.find('more') != -1:
            return True
    return False

def getnewlink(soup):  # to get the link to the next page of tweets on twitter
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        if str(b) == 'more':
            c = link['href']
            d = 'http://mobile.twitter.com' + c
            return d

def crawl(follower, in_depth):  # main method of sorts
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        a = getPage(follower)
        soup = getSoup(a)
        gettweets(soup, output)
        tweets = are_more_tweets(soup)
        while tweets:
            b = getnewlink(soup)
            red = urllib.urlopen(b)
            html = red.read()
            soup = BeautifulSoup(html)
            gettweets(soup, output)
            tweets = are_more_tweets(soup)
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            print name
            crawl(name, in_depth - 1)

crawl(start_follower, depth)
print("Program done. Look at output file.")
More specifically, I seem to get a separate HTML file for about the first five followers, and then no new files are created. Any help would be appreciated!
The depth value is different between the snippet and the full code (depth is 3 in the snippet but 2 in the full crawler, so you only get one level of recursion in the full code). Also, you only grab the first five names from the friends list: for name in list(names)[0:5]:. So you get six people total: the starting follower and their first five friends.
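A quick way to sanity-check the file count (a small illustrative sketch, assuming every user has at least five friends and no overlap between friend sets):

def expected_file_count(depth, branching=5):
    # each recursion level writes one file per visited user:
    # depth 1 -> 1 file, depth 2 -> 1 + 5 = 6, depth 3 -> 1 + 5 + 25 = 31
    return sum(branching ** level for level in range(depth))

print(expected_file_count(2))  # 6, matching the full crawler with depth = 2
print(expected_file_count(3))  # 31, roughly what the snippet with depth = 3 could produce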