How to get multiple image links - Python

import requests
from bs4 import BeautifulSoup

def get_links(statu, data, n_img, url, agent):
    if statu == 0:
        print("The website doesn't respond. Please try again later", end=" ")
    else:
        img_links = []
        r = requests.get(url, headers=agent).text
        soup = BeautifulSoup(r, "lxml")
        results = soup.find_all("div", attrs={"class": "view"})
        results = soup.find_all("div", attrs={"class": "interaction-view"})
        results = soup.find_all("div", attrs={"class": "photo-list-photo-interaction"})
        # results = soup.find_all("a", attrs={"class": "overlay"}, limit=n_img)
        print(results)
        for result in results:
            link = result.get("href")
            img_links.append(link)
        return img_links
In order to download multiple images, I am trying to get the links from Flickr. To do that, I wrote the code above, and everything was fine until the line results=soup.find_all("div",attrs={"class":"photo-list-photo-interaction"}). Before that line I could get the HTML, but on that line I get nothing back.
How can I solve this problem? Thank you!

Instead of scraping with Beautiful Soup, why not use the Flickr API? Alternatively, you could use Flickr's RSS feeds and parse them with the feedparser module.
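For example, a minimal feedparser sketch (the feed URL, tags value, and the media_thumbnail attribute are assumptions here; check Flickr's feed documentation for the exact parameters):

import feedparser

# Flickr public-photos feed; the tags value is just an example
feed_url = "https://www.flickr.com/services/feeds/photos_public.gne?tags=landscape&format=rss2"
feed = feedparser.parse(feed_url)

for entry in feed.entries:
    # feedparser normally exposes media:thumbnail elements as media_thumbnail
    for thumb in entry.get("media_thumbnail", []):
        print(thumb.get("url"))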
If you still want to use BeautifulSoup:
import requests
from bs4 import BeautifulSoup

def flickr_photos(url):
    img_urls = []
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    photos = soup.find_all('div', {'class': 'view'})
    for photo in photos:
        try:
            # the image url is inside the CSS background-image of the style attribute,
            # e.g. style="background-image: url(//live.staticflickr.com/...)"
            img = photo['style'].split('(//').pop()
            if img.startswith('live'):
                # drop the trailing ')' of the url(...) value
                img_urls.append(f'https://{img[:-1]}')
        except KeyError:
            # skip divs without a style attribute
            pass
    return img_urls
The reason your code doesn't work is that Flickr keeps the image's URL in the background-image style attribute, not in an href.
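For example, you could call it like this (the search URL is only an illustration):

for img in flickr_photos('https://www.flickr.com/search/?text=sunset'):
    print(img)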

Related

BeautifulSoup not Retrieving Accurate HTML - Requests_HTML

I am trying to parse a picture off of this page. Specifically, I am trying to parse the image under the div class "gOenxf". When you inspect the webpage, the HTML elements show an "encrypted" image URL, which is useful to me and what I am trying to retrieve. However, when I parse that same page/class, the image comes back as a "data URL", which is not very useful to me. I am using requests_html because I need something faster than Selenium. I am also using BeautifulSoup because it is easier than requests_html's built-in ".find" system. Does anyone know why this is happening, or a solution to the problem?
from requests_html import HTMLSession
from bs4 import BeautifulSoup

def google_initiate():
    url = 'https://www.google.com/search?tbm=shop&q=desk'
    session = HTMLSession()
    data = session.get(url)
    google_soup = BeautifulSoup(data.text, features='html.parser')
    google_parsed = google_soup.find_all('div', {'class': ['sh-dgr__gr-auto', 'sh-dgr__grid-result']})
    google_initiate.google_parse_page = url
    session.close()
    return google_parsed

for google_post in google_initiate():
    post_image_url = str(google_post.find(class_='gOenxf'))
    post_image_url = post_image_url[post_image_url.find('src="') + len('src="'):post_image_url.rfind('"')]
    print(post_image_url)

What's the best way to scrape and plot connected pages on a website with Python?

I've been working on a project that takes a URL as input and creates a map of the page connections on a website.
The way I was approaching this was to scrape the page for links, then create a Page object to hold the href of the page and a list of all the child links on that page. Once I have the data pulled from all the pages on the site, I would pass it to a graphing library like matplotlib or plotly in order to get a graphical representation of the connections between pages on the website.
This is my code so far:
from urllib.request import urlopen
import urllib.error
from bs4 import BeautifulSoup, SoupStrainer

#object to hold page href and child links on page
class Page:
    def __init__(self, href, links):
        self.href = href
        self.children = links

    def getHref(self):
        return self.href

    def getChildren(self):
        return self.children

#method to get an array of all hrefs on a page
def getPages(url):
    allLinks = []
    try:
        #combine the starting url and the new href
        page = urlopen('{}{}'.format(startPage, url))
        for link in BeautifulSoup(page, 'html.parser', parse_only=SoupStrainer('a')):
            try:
                if 'href' in link.attrs:
                    allLinks.append(link)
            except AttributeError:
                #if there is no href, skip the link
                continue
        #return an array of all the links on the page
        return allLinks
    #catch pages that can't be opened
    except urllib.error.HTTPError:
        print('Could not open {}{}'.format(startPage, url))

#get starting page url from user
startPage = input('Enter a URL: ')
page = urlopen(startPage)

#sets to hold unique hrefs and page objects
pages = set()
pageObj = set()

for link in BeautifulSoup(page, 'html.parser', parse_only=SoupStrainer('a')):
    try:
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                pages.add(newPage)
                #get the child links on this page
                pageChildren = getPages(newPage)
                #create a new page object, add to set of page objects
                pageObj.add(Page(newPage, pageChildren))
    except AttributeError:
        print('{} has an attribute error.'.format(link))
        continue
Would Scrapy be better for what I'm trying to do?
What library would work best for displaying the connections?
How do I fix the getPages function to correctly combine the user-inputted url with the hrefs pulled from the page? If I'm at 'https://en.wikipedia.org/wiki/Main_Page', I'll get 'Could not open https://en.wikipedia.org/wiki/Main_Page/wiki/English_language'. I think I need to combine from the end of the .org/ and drop the /wiki/Main_Page but I don't know the best way to do this.
This is my first real project so any pointers on how I could improve my logic are appreciated.
That's a nice idea for a first project!
Would Scrapy be better for what I'm trying to do?
There are numerous advantages that a Scrapy version of your project would have over your current version. The advantage you would feel immediately is the speed at which your requests are made. However, it may take you a while to get used to the structure of Scrapy projects.
How do I fix the getPages function to correctly combine the user-inputted url with the hrefs pulled from the page? If I'm at 'https://en.wikipedia.org/wiki/Main_Page', I'll get 'Could not open https://en.wikipedia.org/wiki/Main_Page/wiki/English_language'. I think I need to combine from the end of the .org/ and drop the /wiki/Main_Page but I don't know the best way to do this.
You can achieve this using urllib.parse.urljoin(startPage, relativeHref). Most of the links you're gonna find are relative links which you can then convert to an absolute link using the urljoin function.
In your code you would change newPage = link.attrs['href'] to newPage = urllib.parse.urljoin(startPage, link.attrs['href']) and page = urlopen('{}{}'.format(startPage, url)) to page = urlopen(url).
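For example:

from urllib.parse import urljoin

urljoin('https://en.wikipedia.org/wiki/Main_Page', '/wiki/English_language')
# -> 'https://en.wikipedia.org/wiki/English_language'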
Here are a couple of examples as to where you can change your code slightly for some benefits.
Instead of for link in BeautifulSoup(page, 'html.parser', parse_only=SoupStrainer('a')): you can use BeautifulSoup's find_all() function like this for link in BeautifulSoup(page, 'html.parser').find_all('a', href=True):. This way all your links are already guaranteed to have an href.
In order to prevent links on the same page from occurring twice, you should change allLinks = [] to be a set instead.
This is up to preference, but since Python 3.6 there is another syntax called "f-Strings" for referencing variables in strings. You could change print('{} has an attribute error.'.format(link)) to print(f'{link} has an attribute error.') for example.
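What library would work best for displaying the connections?
One option (just a sketch, not the only choice) is networkx together with matplotlib; plotly also has network-graph examples if you prefer interactive output. Assuming the pageObj set from your code, where each Page holds <a> tags as children:

import networkx as nx
import matplotlib.pyplot as plt

def plot_pages(pageObj):
    G = nx.DiGraph()
    for page in pageObj:
        # getPages() currently returns <a> tags, so pull the href out of each child
        for child in (page.getChildren() or []):
            G.add_edge(page.getHref(), child.attrs['href'])
    nx.draw(G, with_labels=True, node_size=50, font_size=6)
    plt.show()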

Can't find the Web scraping selector for div class

I am new to web scraping and this is one of my first web scraping projects. I can't find the right selector for my soup.select("").
I want to get the "data-phone" (see the picture below to understand), but it is in a div class and then inside an <a href>, which makes it a little complicated for me!
I searched online and found that I have to use soup.find_all, but that was not very helpful. Can anyone help me or give me a quick tip? Thank you!
my code:
import webbrowser, requests, bs4, os
url = "https://www.pagesjaunes.ca/search/si/1/electricien/Montreal+QC"
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text)
result = soup.find('a', {'class', 'mlr__item__cta jsMlrMenu'})
Phone = result['data-phone']
print(Phone)
I think one of the simplest ways is to use soup.select, which allows normal CSS selectors.
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors
soup.select('a.mlr__item_cta.jsMlrMenu')
This should return the entire list of anchors from which you can pick the data attribute.
Note I just tried it in the terminal:
from bs4 import BeautifulSoup
import requests
url = 'https://en.wikipedia.org/wiki/Web_scraping'
r = requests.get(url)
soup = BeautifulSoup(r.text)
result = soup.select('a.mw-jump-link') # or any other selector
print(result)
print(result[0].get("href"))
You will have to loop over the result of soup.select and just collect the data-phone value from the attribute.
UPDATE
Ok I have searched in the DOM myself, and here is how I managed to retrieve all the phone data:
anchores = soup.select('a[data-phone]')
for a in anchores:
    print(a.get('data-phone'))
It works also with only data selector like this: soup.select('[data-phone]')
Surprisingly, for me this one with classes also works:
for a in soup.select('a.mlr__item__cta.jsMlrMenu'):
    print(a.get('data-phone'))
There is no surprise, we just had a typo in our first selector...
Find the difference :)
GOOD: a.mlr__item__cta.jsMlrMenu
BAD : a.mlr__item_cta.jsMlrMenu

Function from list of parameters

Can you please help me with my Python code? I want to parse several homepages, provided in the list html, with Beautiful Soup using the function stars.
html=["https://www.onvista.de/aktien/fundamental/Adidas-Aktie-DE000A1EWWW0", "https://www.onvista.de/aktien/fundamental/Allianz-Aktie-DE0008404005", "https://www.onvista.de/aktien/fundamental/BASF-Aktie-DE000BASF111"]
def stars(html):
    bsObj = BeautifulSoup(html.read())
    starbewertung = bsObj.findAll("section")[8].findAll("div")[1].findAll("span")[16]
    str_cells = str(starbewertung)
    cleantext = BeautifulSoup(str_cells, "lxml").get_text()
    print(cleantext)
    lst = []
    lst.append(cleantext)
stars(html)
Instead I am getting an error "AttributeError: 'list' object has no attribute 'read'"
As some of the comments mentioned you need to use the requests library to actually grab the content of each link in your list.
import requests
from bs4 import BeautifulSoup
html=["https://www.onvista.de/aktien/fundamental/Adidas-Aktie-DE000A1EWWW0", "https://www.onvista.de/aktien/fundamental/Allianz-Aktie-DE0008404005", "https://www.onvista.de/aktien/fundamental/BASF-Aktie-DE000BASF111"]
def stars(html):
    for url in html:
        resp = requests.get(url)
        bsObj = BeautifulSoup(resp.content, 'html.parser')
        print(bsObj)  # Should print the entire html document.
        # Do other stuff with bsObj here.
stars(html)
The IndexError from bsObj.findAll("section")[8].findAll("div")[1].findAll("span")[16] is something you'll need to figure out yourself.
You have a couple of errors here.
You are trying to load the whole list of pages into BeautifulSoup; you should process the pages one by one.
You should get the source code of each page before processing it.
There is no "section" element on the page you are loading, so you will get an IndexError when you try to take the element at index 8. You might need to check whether you found anything before indexing (see the sketch after the code below).
def stars(html):
    request = requests.get(html)
    if request.status_code != 200:
        return
    page_content = request.content
    bsObj = BeautifulSoup(page_content)
    starbewertung = bsObj.findAll("section")[8].findAll("div")[1].findAll("span")[16]
    str_cells = str(starbewertung)
    cleantext = BeautifulSoup(str_cells, "lxml").get_text()
    print(cleantext)

for page in html:
    stars(page)
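For the last point, a minimal sketch of checking the result before indexing, inside stars() (the selector chain is copied from the question and may itself need adjusting):

sections = bsObj.findAll("section")
if len(sections) <= 8:
    # nothing (or not enough) found; bail out instead of raising an IndexError
    print("Not enough 'section' elements on this page")
    return
starbewertung = sections[8].findAll("div")[1].findAll("span")[16]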

How to download the picture that is located in the same place on each URL from a URL list text file?

I have some code which I wrote with the help of this community (shoutout to #chitown88).
Now I want to implement the same method for scraping photos on the pages. One example is the following URL:
https://www.meisamatr.com/fa/product/%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C/%D9%84%D9%88%D8%A7%D8%B2%D9%85-%D8%AC%D8%A7%D9%86%D8%A8%DB%8C-%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C/%D8%A8%D8%B1%D8%B3%D8%8C-%D8%A7%D9%BE%D9%84%DB%8C%DA%A9%D8%A7%D8%AA%D9%88%D8%B1-%D9%88-%D8%B3%D8%A7%DB%8C%D8%B1/4647-%D8%A7%D8%B3%D9%86%D8%B3-%D8%A8%D8%B1%D8%B3-%D9%84%D8%A8-%D9%87%D9%84%D9%88-%D9%87%D9%BE%DB%8C%D9%86%D8%B3.html
I want to download the full-size picture which can be found if we inspect element on the picture:
<img src="https://www.meisamatr.com/upload/thumb1/product/1518428319.jpg" alt="اسنس برس لب هلو هپینس" title="اسنس برس لب هلو هپینس" class="thumb" data-large-img-url="https://www.meisamatr.com/upload/product/1518428319.jpg" id="magnifier-item-0">
the following URL is what we need:
data-large-img-url="https://www.meisamatr.com/upload/product/1518428319.jpg"
Let's suppose we have a file called links.txt which looks like this
https://www.meisamatr.com/fa/product/آرایشی/آرایش-صورت/کانسیلر/6494-اسنس-کانسیلر-کموفلاژ-با-پوشانندگی-کامل-10.html
https://www.meisamatr.com/fa/product/آرایشی/آرایش-صورت/کانسیلر/6493-اسنس-کانسیلر-کموفلاژ-با-پوشانندگی-کامل-05.html
https://www.meisamatr.com/fa/product/آرایشی/آرایش-صورت/کرم-پودر/6492-اسنس-هایلایتر-برنزه-کننده-مایع.html
https://www.meisamatr.com/fa/product/آرایشی/آرایش-صورت/پودر-صورت/6491-اسنس-پودر-فشرده-صورت-10.html
.
.
.
The following is what I came up with, but it shows a "No connection adapters were found for" error.
What do you suggest? Thank you in advance for your time.
import requests
import urllib.request
from bs4 import BeautifulSoup
with open('links.txt','r') as f:
    urls = f.read().split()

for url in urls:
    try:
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')
        page = soup.find_all('div', class_='slick-slide slick-active')
        pic = page.find('img', class_='thumb')['data-large-img-url']
        print(pic)
        urllib.request.urlretrieve(pic, "local-filename.jpg")
    except Exception as e:
        print(e)
        break
You're not too far off; you just need to adjust how you get those specific tags.
soup.find_all will return a list of elements. You'll need to iterate through those to get what you want. However, in the links I've seen, what you are trying to get is in the first instance of that tag, so I simply changed from .find_all() to just .find()
I also removed the break. With the break, your loop would stop once it reached an error from a link, even if it hadn't gone through all the links. Removing the break lets it continue to the other links even if it comes across a link that doesn't yield the image URL.
You also may want to have the image file name you save as to be dynamic, otherwise it'll overwrite each time:
import requests
import urllib.request
from bs4 import BeautifulSoup
with open('links.txt','r', encoding="utf8") as f:
    urls = f.read().split()

num = 1
for url in urls:
    try:
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')
        page = soup.find('div', class_='single_slideshow_big')
        pic = page.find('img')['data-large-img-url']
        print(pic)
        urllib.request.urlretrieve(pic, "local-filename_%02d.jpg" %(num))
        num += 1
    except Exception as e:
        print(e)
