Extract image links from a webpage using Python

I want to get all of the pictures (the NBA team logos) on this page:
http://www.cbssports.com/nba/draft/mock-draft
However, my code gives me a bit more than that. It gives me:
<img src="http://sports.cbsimg.net/images/nba/logos/30x30/ORL.png" alt="Orlando Magic" width="30" height="30" border="0" />
How can I shorten the output so it only gives me http://sports.cbsimg.net/images/nba/logos/30x30/ORL.png?
My code:
import urllib2
from BeautifulSoup import BeautifulSoup
# or if you're using BeautifulSoup4:
# from bs4 import BeautifulSoup

soup = BeautifulSoup(urllib2.urlopen('http://www.cbssports.com/nba/draft/mock-draft').read())
rows = soup.findAll("table", attrs={'class': 'data borderTop'})[0].tbody.findAll("tr")[2:]
for row in rows:
    fields = row.findAll("td")
    if len(fields) >= 3:
        anchor = fields[1].find("a")
        if anchor:
            print anchor
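One minimal fix (a sketch, assuming the logo <img> sits inside the anchor, as the question's printed output suggests): drill down from the anchor to its img tag and print only the src attribute, reusing rows from the code above:

# same loop as above, but printing the img's src instead of the whole <a> tag
for row in rows:
    fields = row.findAll("td")
    if len(fields) >= 3:
        anchor = fields[1].find("a")
        if anchor and anchor.img:
            print anchor.img["src"]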

I know this can be "traumatic", but for those automatically generated pages, where you just want to grab the damn images and never come back, a quick-n-dirty regular expression that matches the desired pattern tends to be my choice (needing no Beautiful Soup dependency is a great advantage):
import urllib, re

source = urllib.urlopen('http://www.cbssports.com/nba/draft/mock-draft').read()

## every image name is an abbreviation composed of capital letters, so...
for link in re.findall(r'http://sports\.cbsimg\.net/images/nba/logos/30x30/[A-Z]+\.png', source):
    print link
    ## the code above just prints the link;
    ## if you want to actually download, set the flag below to True
    actually_download = False
    if actually_download:
        filename = link.split('/')[-1]
        urllib.urlretrieve(link, filename)
Hope this helps!

To save all the images on http://www.cbssports.com/nba/draft/mock-draft,
import urllib2
import os
from BeautifulSoup import BeautifulSoup

URL = "http://www.cbssports.com/nba/draft/mock-draft"
default_dir = os.path.join(os.path.expanduser("~"), "Pictures")
opener = urllib2.build_opener()
urllib2.install_opener(opener)
soup = BeautifulSoup(urllib2.urlopen(URL).read())
imgs = soup.findAll("img", {"alt": True, "src": True})
for img in imgs:
    img_url = img["src"]
    filename = os.path.join(default_dir, img_url.split("/")[-1])
    img_data = opener.open(img_url)
    f = open(filename, "wb")
    f.write(img_data.read())
    f.close()
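One caveat (my own assumption, not part of the original answer): opener.open() will fail if a src value happens to be relative, so you may want to resolve each URL against the page first:

import urlparse  # Python 2 stdlib; in Python 3 this lives in urllib.parse

# inside the loop above, resolve possibly-relative src values against the page URL
img_url = urlparse.urljoin(URL, img["src"])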
To save any particular image on http://www.cbssports.com/nba/draft/mock-draft,
use
soup.find("img",{"src":"image_name_from_source"})

You can use these functions to get the list of all image URLs from a URL:
import re

import requests


def get_url_images_in_text(html, protocol):
    """
    :param html: the HTML to extract image URLs from.
    :param protocol: the protocol of the website, prepended to URLs that do not start with one.
    :return: list of image URLs.
    """
    urls = []
    all_urls = re.findall(r'((http:|https:)?//[^"\' ]*?\.(png|jpg))', html, flags=re.IGNORECASE | re.MULTILINE | re.UNICODE)
    for url in all_urls:
        if not url[0].startswith("http"):
            urls.append(protocol + url[0])
        else:
            urls.append(url[0])
    return urls


def get_images_from_url(url):
    """
    :param url: the URL to extract image URLs from.
    :return: list of image URLs.
    """
    protocol = url.split('/')[0]  # e.g. 'http:' or 'https:'
    resp = requests.get(url)
    return get_url_images_in_text(resp.text, protocol)
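A quick usage sketch (the page URL is just an illustrative example):

if __name__ == '__main__':
    # print every .png/.jpg URL referenced by the page's HTML
    for image_url in get_images_from_url('http://www.cbssports.com/nba/draft/mock-draft'):
        print(image_url)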

Related

Unable to read document

I can't figure out how to get this script to read the actual documents inside the links it pulls; it never brings back the text of the documents themselves. I also tried using iframe and src but was unsuccessful.
I have never tried anything like this before, so I'm a little stumped on what else I can do.
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import urlparse, parse_qs
import io
import requests
from PyPDF2 import PdfReader

# specify the web driver path
driver = webdriver.Chrome("path/to/chromedriver")

# navigate to the website
url = "https://probaterecords.shelbyal.com/shelby/search.do?indexName=opr&templateName=Main&searchQuery=richard+wygle&lq=&searchType=1&regex=%5B%5Ea-zA-Z0-9_%5D&regexwithspaces=%5B%5Ea-zA-Z0-9%5Cs_%5D&regexwithasterisks=%5B%5E*a-zA-Z0-9%5Cs_%5D&sortBy=InstrumentFilter&desc=N&searchable=DisplayName%2CLastName%2CFirstName%2CInstrument%2CRecDate%2CPartyRole%2CDocTypeDesc%2CDocType%2CBook%2CPage%2CLot%2CBlock%2CTownship%2COther%2CFreeform%2COtherName&isPhoneticSearch=&q=richard+wygle&basicSortOrder=InstrumentFilter%7CN&Instrument=&Instrument_select=AND&RecDate=&RecDate=&RecDate_select=AND&LastName=&LastName_select=AND&searchkindLast=StartsLast&FirstName=&FirstName_select=OR&FirstName2=&FirstName2_select=AND&DocTypeDesc=&DocTypeDesc_select=AND&Book=&Book_select=AND&Page=&Page_select=AND&MAPBOOK=&MAPBOOK_select=AND&MAPPAGE=&MAPPAGE_select=AND&Lot%23=&Lot%23_select=AND&Lot=&Lot_select=AND&Block=&Block_select=AND&Section=&Section_select=AND&Township=&Township_select=AND&Range=&Range_select=AND&QT=&QT_select=AND&BQT=&BQT_select=AND&LegacyNum=&LegacyNum_select=AND&advancedSortOrder=InstrumentFilter%7CN"
driver.get(url)

# get the page source
html = driver.page_source

# parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# find all the anchor tags with class "nocolor pphoto"
links = soup.select('a[class="nocolor pphoto"]')

# Create an empty dictionary to store the links
unique_links = {}
for link in links:
    href = link['href']
    if href.startswith('/shelby/search.do?indexName=shelbyimages&lq='):
        # construct the full link
        full_link = 'https://probaterecords.shelbyal.com' + href
        # parse the query parameters from the link
        parsed_url = urlparse(full_link)
        query_params = parse_qs(parsed_url.query)
        # extract the instrument number from the query parameters
        instrument_number = query_params['lq'][0]
        # Extract the document type
        options = soup.select('select[name="DocTypeDesc"] option')
        for option in options:
            # check if the option text contains "deeds"
            if "deeds" in option.get_text().lower():
                doc_type = option.get_text()
        # add the link to the dictionary
        unique_links[instrument_number] = (full_link, doc_type)

# Iterate over the unique links
for instrument_number, link_info in unique_links.items():
    full_link, doc_type = link_info
    # Open the PDF file from the url
    response = requests.get(full_link)
    pdf_file = io.BytesIO(response.content)
    pdf_reader = PdfReader(pdf_file)
    # Get the number of pages
    pages = len(pdf_reader.pages)
    # Initialize a variable to store the text
    text = ""
    # Iterate over the pages
    for page in pdf_reader.pages:
        # Extract the text from the page
        text += page.extract_text()
    # Print the document type, instrument number and the text
    print("Document Type: ", doc_type)
    print("Instrument Number: ", instrument_number)
    print("Text: ", text)

Problem in fetching long URLs using BeautifulSoup

I am trying to fetch a URL from a webpage. In the browser's Inspect panel the link appears as a relative URL (it starts with ../../), and that is also what my Python code returns.
How can I get the actual absolute URL, without the ../../ part, using BeautifulSoup?
Here is my code in case it's needed:
import re
import requests
from bs4 import BeautifulSoup

source = requests.get('https://books.toscrape.com/catalogue/category/books_1/index.html').text
soup = BeautifulSoup(source, 'lxml')

# article = soup.find('article')
# title = article.div.a.img['alt']
# print(title['alt'])

titles, topics, urls, sources = [], [], [], []

article_productPod = soup.findAll('article', {"class": "product_pod"})

for i in article_productPod:
    titles.append(i.div.a.img['alt'])
# print(titles)

for q in article_productPod:
    urls.append(q.h3.a['href'])
print(urls[0])

# for z in range(len(urls)):
#     source2 = requests.get("https://" + urls[z])
Use urllib:
import urllib.parse
Store your target URL in a separate variable:
src_url = r'https://books.toscrape.com/catalogue/category/books_1/index.html'
source = requests.get(src_url).text
Join the website's URL and the relative URL:
for q in article_productPod:
    urls.append(urllib.parse.urljoin(src_url, q.h3.a['href']))
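urljoin resolves the ../../ segments against the base URL. For example (the relative path here is illustrative):

from urllib.parse import urljoin

base = 'https://books.toscrape.com/catalogue/category/books_1/index.html'
print(urljoin(base, '../../a-light-in-the-attic_1000/index.html'))
# https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html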

How to scrape data from interactive chart using python?

I have a link that shows exactly the chart I want to scrape: https://index.minfin.com.ua/ua/economy/index/svg.php?indType=1&fromYear=2010&acc=1
I simply can't work out whether it is an XML or an SVG chart, or how to scrape the data from it. I think I need to use bs4 and requests, but I don't know how to go about it.
Could anyone help?
You will load HTML like this:
import requests
url = "https://index.minfin.com.ua/ua/economy/index/svg.php?indType=1&fromYear=2010&acc=1"
resp = requests.get(url)
data = resp.text
Then you will create a BeautifulSoup object from this HTML.
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, features="html.parser")
After this, how you parse out what you want is fairly subjective and candidate solutions can vary a lot. This is how I did it:
Using BeautifulSoup, I parsed all "rect" elements and checked whether an "onmouseover" attribute exists on each rect.
rects = soup.svg.find_all("rect")
yx_points = []
for rect in rects:
    if rect.has_attr("onmouseover"):
        text = rect["onmouseover"]
        x_start_index = text.index("'") + 1
        y_finish_index = text[x_start_index:].index("'") + x_start_index
        yx = text[x_start_index:y_finish_index].split()
        print(text[x_start_index:y_finish_index])
        yx_points.append(yx)
Each onmouseover attribute contains a quoted pair such as 02.2015 155,1, and that quoted part is what gets extracted.
Here, this is how yx_points looks like now:
[['12.2009', '100,0'], ['01.2010', '101,8'], ['02.2010', '103,7'], ...]
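If you need numeric values, note the comma decimal separator; a small follow-up sketch (assuming every entry is a month/value pair):

# convert ['12.2009', '100,0'] style pairs into (month, float) tuples
points = [(month, float(value.replace(',', '.'))) for month, value in yx_points]
print(points[:3])  # [('12.2009', 100.0), ('01.2010', 101.8), ('02.2010', 103.7)]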
from bs4 import BeautifulSoup
import requests
import re

# First get all the text from the url.
url = "https://index.minfin.com.ua/ua/economy/index/svg.php?indType=1&fromYear=2010&acc=1"
response = requests.get(url)
html = response.text

# Find all the tags in which the data is stored.
soup = BeautifulSoup(html, 'lxml')
texts = soup.findAll("rect")

final = []
for each in texts:
    names = each.get('onmouseover')
    try:
        q = re.findall(r"'(.*?)'", names)
        final.append(q[0])
    except Exception as e:
        print(e)
# The details are appended to the final variable.
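Going by the onmouseover values shown in the first answer, final should end up holding strings like these (my reading, not verified output):

print(final[:3])  # e.g. ['12.2009 100,0', '01.2010 101,8', '02.2010 103,7']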

how do I create a list from a sitemap.xml file to extract the url in python?

I need to write code that checks scraped images for a specific word. To explain: starting from a sitemap.xml page, my code must visit every link present in that XML file and check, inside each link, whether the word appears in an image link.
The sitemap is adidas = http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml
This is the code I created to find images whose link contains the word "zoom":
import requests
from bs4 import BeautifulSoup

html = requests.get('http://www.adidas.it/scarpe-superstar/C77124.html').text
bs = BeautifulSoup(html)
possible_links = bs.find_all('img')
for link in possible_links:
    if link.has_attr('src'):
        if 'zoom' in link['src']:
            print link['src']
But I'm looking for a method to scrape that list automatically. Thank you so much.
I tried this to get the list:
from bs4 import BeautifulSoup
import requests

url = "http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data)
for url in soup.findAll("loc"):
    print url.text
But I can't connect the two steps: I want to find the word "zoom" in every link present in sitemap.xml.
Thank you so much.
import requests
from bs4 import BeautifulSoup
import re


def make_soup(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup


# put urls in a list
def get_xml_urls(soup):
    urls = [loc.string for loc in soup.find_all('loc')]
    return urls


# get the img urls
def get_src_contain_str(soup, string):
    srcs = [img['src'] for img in soup.find_all('img', src=re.compile(string))]
    return srcs


if __name__ == '__main__':
    xml = 'http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml'
    soup = make_soup(xml)
    urls = get_xml_urls(soup)
    # loop through the urls
    for url in urls:
        url_soup = make_soup(url)
        srcs = get_src_contain_str(url_soup, 'zoom')
        print(srcs)

Extracting image src based on attribute with BeautifulSoup

I'm using BeautifulSoup to get an HTML page from IMDb, and I would like to extract the poster image from the page. I've found the image element based on one of its attributes, but I don't know how to extract the URL inside it.
Here's my code:
url = 'http://www.imdb.com/title/tt%s/' % (id)
soup = BeautifulSoup(urllib2.urlopen(url).read())
print("before FOR")
for src in soup.find(itemprop="image"):
    print("inside FOR")
    print(link.get('src'))
You're almost there - just a couple of mistakes. soup.find() gets the first element that matches, not a list, so you don't need to iterate over it. Once you have got the element, you can get its attributes (like src) using dictionary access. Here's a reworked version:
film_id = '0423409'
url = 'http://www.imdb.com/title/tt%s/' % (film_id)
soup = BeautifulSoup(urllib2.urlopen(url).read())
link = soup.find(itemprop="image")
print(link["src"])
# output:
http://ia.media-imdb.com/images/M/MV5BMTg2ODMwNTY3NV5BMl5BanBnXkFtZTcwMzczNjEzMQ##._V1_SY317_CR0,0,214,317_.jpg
I've changed id to film_id, because id() is a built-in function, and it's bad practice to mask those.
I believe your example is very close. You need to use findAll() instead of find(), and when you iterate you switch from src to link; in the example below I renamed the loop variable to tag.
This code is working for me with BeautifulSoup4:
url = 'http://www.imdb.com/title/tt%s/' % (id,)
soup = BeautifulSoup(urllib2.urlopen(url).read())
print "before FOR"
for tag in soup.findAll(itemprop="image"):
    print "inside FOR"
    print(tag['src'])
If I understand correctly, you are looking for the src of the image so that you can extract it afterwards.
First you need to find (using the inspector) where in the HTML the image sits. For example, in my particular case, where I was scraping soccer team shields, I needed:
# aliases assumed for the uOpen and BS names used below
from urllib.request import urlopen as uOpen
from bs4 import BeautifulSoup as BS

m_url = 'http://www.marca.com/futbol/primera/equipos.html'
client = uOpen(m_url)
page = client.read()
client.close()

page_soup = BS(page, 'html.parser')
teams = page_soup.findAll('li', {'id': 'nombreEquipo'})

for team in teams:
    name = team.h2.text
    shield_url = team.img['src']
Then you need to process the image. You have two options.
1st: using numpy (with OpenCV):
import numpy as np
import cv2

def url_to_image(url):
    '''
    Extract an image from a URL.
    '''
    resp = uOpen(url)
    image = np.asarray(bytearray(resp.read()), dtype='uint8')
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    return image

shield = url_to_image(shield_url)
2nd: using the scikit-image library (which you will probably need to install):
from skimage import io

shield = io.imread('http:' + shield_url)
Note: just in this particular example I needed to add http: at the beginning.
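That was needed because the src was protocol-relative (it started with //). A small guard (my own addition, not part of the original answer) makes this generic:

# prepend a scheme only when the src is protocol-relative (starts with //)
if shield_url.startswith('//'):
    shield_url = 'http:' + shield_url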
Hope it helps!
Here's a full working example with gazpacho:
Step 1 - import everything and download the html:
from pathlib import Path
from urllib.request import urlretrieve as download
from gazpacho import Soup
id = 'tt5057054'
url = f"https://www.imdb.com/title/{id}"
soup = Soup.get(url)
Step 2 - find the src url for the image asset:
image = (soup
    .find("div", {"id": "title-overview"})
    .find("div", {"class": "poster"})
    .find("img")
    .attrs['src']
)
Step 3 - save it to your machine:
directory = "images"
Path(directory).mkdir(exist_ok=True)
extension = image.split('.')[-1]
download(image, f"{directory}/{id}.{extension}")
