Looping through HTML & following links - Python

I am writing code that is supposed to open a URL, identify the 3rd link, and repeat this process 3 times (each time with the new URL).
I wrote a loop (below), but it seems to start over with the original URL each time.
Can someone help me fix my code?
import urllib.request, urllib.parse, urllib.error
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# blank list
l = []

# starting url
url = input('Enter URL: ')
if len(url) < 1:
    url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'

# loop
for _ in range(4):
    html = urllib.request.urlopen(url).read()    # open url
    soup = BeautifulSoup(html, 'html.parser')    # parse with BeautifulSoup
    tags = soup('a')                             # extract the anchor tags
    for tag in tags:
        url = tag.get('href', None)              # extract links from tags
        l.append(url)                            # add the links to a list
    url = l[2:3]                                 # slice the list to extract the 3rd url
    url = ' '.join(str(e) for e in url)          # change the type to string
    print(url)
Current Output:
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
Desired output:
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Mhairade.html
http://py4e-data.dr-chuck.net/known_by_Butchi.html
http://py4e-data.dr-chuck.net/known_by_Anayah.html

The problem is that l keeps growing across iterations: it is never reset, so l[2] is always the 3rd link of the first page. You need to define the empty list within the loop. The following code works:
import urllib.request, urllib.parse, urllib.error
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# blank list -- no longer created here
# l = []

# starting url
url = input('Enter URL: ')
if len(url) < 1:
    url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'

# loop
for _ in range(4):
    l = []                                       # reset the list on every iteration
    html = urllib.request.urlopen(url).read()    # open url
    soup = BeautifulSoup(html, 'html.parser')    # parse with BeautifulSoup
    tags = soup('a')                             # extract the anchor tags
    for tag in tags:
        url = tag.get('href', None)              # extract links from tags
        l.append(url)                            # add the links to a list
    url = l[2:3]                                 # slice the list to extract the 3rd url
    url = ' '.join(str(e) for e in url)          # change the type to string
    print(url)
Result in terminal:
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Mhairade.html
http://py4e-data.dr-chuck.net/known_by_Butchi.html
http://py4e-data.dr-chuck.net/known_by_Anayah.html

To make it simpler in your own style, with a single loop: take the 3rd anchor directly instead of collecting every link first (same imports and starting url as above):
for _ in range(4):
    html = urllib.request.urlopen(url).read()    # open url
    soup = BeautifulSoup(html, 'html.parser')    # parse with BeautifulSoup
    tag = soup('a')[2]                           # the 3rd anchor tag
    url = tag.get('href', None)
    print(url)
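As an aside, urljoin is imported above but never used. The py4e pages happen to use absolute hrefs, but if a page returned relative links, the same loop could resolve them against the current page - a minimal sketch of that variant, not required for this exercise:

from urllib.parse import urljoin

for _ in range(4):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    href = soup('a')[2].get('href')    # 3rd anchor's href, possibly relative
    url = urljoin(url, href)           # resolve it against the current page url
    print(url)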

Related

How can I change the code so that the HTML tags do not appear in the output?

from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt)
With the above code, I get the output:
[<h1 class="celeb-name">Ayden Sng</h1>]
What do I need to change in my code so that I only get 'Ayden Sng' as my output?
Iterate over each entry of the txt list and extract its text property:
txt = [element.text for element in txt] # ['Ayden Sng']
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt[0].text)
If there is more than one result, you can use this code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
for i in txt:
    print(i.text)
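As a side note, each element also has a get_text method, which can additionally strip surrounding whitespace - a tiny variant of the same loop:

for i in txt:
    print(i.get_text(strip=True))    # like .text, with leading/trailing whitespace stripped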

How do I extract the underlined value in red below and save it as a list?

I want to extract the memCode value in the href of the a tag inside each p tag using soup.
However, I don't know how to extract it at all.
Please help me.
My code:
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

list = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    print(href)
Try this, using a CSS selector:
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://www.gjcouncil.go.kr/kr/member/name.do')
soup = BeautifulSoup(resp.text, "html.parser")
for a in soup.select("div[id='member_list'] > ul > li > a"):
    print(a['href'].split("/")[2])
08070
00716
08040
....
....
You can split on the "=" and take the -1 index. I also changed the selector to the .btn-home class.
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

ids = [i['href'].split('=')[-1] for i in soup.select('.btn-home')]
print(ids)    # show the extracted member codes
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

href_list = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    if href['href'] == '#LINK':
        pass
    else:
        href_list.append(href['href'][-7:])    # keep the last 7 characters (the member code)
print(href_list)
['7620212', '7670126', '7670420', '7650601', '7890930', '7800407', '7660925', '7641102', '7731222', '7801011', '7570803', '7770106', '7590808', '7700831', '7580115', '7710713', '7680112', '7621125', '7711117', '7680213', '7640925', '7591214']
One of the best methods is a regular expression.
Check out this code:
import urllib.request
from bs4 import BeautifulSoup
import re

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

list_ = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    list_.append(href['href'])

regobj = re.compile(r'memCode=(\w+)')
final = list(filter('#LINK'.__ne__, list_))                      # drop the '#LINK' placeholders
result = list(map(lambda i: regobj.search(i).group(1), final))   # pull out each memCode value
print(result)
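For completeness, the same extraction can also be written with a CSS attribute selector, so only anchors whose href actually contains memCode are matched in the first place - a minimal sketch combining the two ideas above:

import re
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
soup = BeautifulSoup(urllib.request.urlopen(url).read(), "html.parser")

# match only anchors whose href contains "memCode", then pull out the value
codes = [re.search(r'memCode=(\w+)', a['href']).group(1)
         for a in soup.select('a[href*="memCode"]')]
print(codes)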

Iterating over urls fails to find correct href in Python using BeautifulSoup

I am iterating through a website with the code below. It loops through the 52 pages and collects the link to each article.
Then it iterates through those URLs and tries to get the link to the English translation. If you look at the Mongolian website, it has a section "Орчуулга" at the top right with "English" underneath - that is the link to the English translation.
However, my code fails to grab the link to the English translation and gives a wrong URL.
Below is a sample output for the first article.
1
{'https://mn.usembassy.gov/mn/2020-naadam-mn/': 'https://mn.usembassy.gov/mn/sitemap-mn/'}
The expected output for the first page should be
1
{'https://mn.usembassy.gov/mn/2020-naadam-mn/': 'https://mn.usembassy.gov/2020-naadam/'}
Below is my code:
import requests
from bs4 import BeautifulSoup

url = 'https://mn.usembassy.gov/mn/news-events-mn/page/{page}/'

urls = []
for page in range(1, 53):
    print(str(page) + "/52")
    soup = BeautifulSoup(requests.get(url.format(page=page)).content, 'html.parser')
    for h in soup.find_all('h2'):
        a = h.find('a')
        urls.append(a.attrs['href'])
print(urls)

i = 0
bilingual_dict = {}
for url in urls:
    i += 1
    print(i)
    soup = BeautifulSoup(requests.get(url.format(page=url)).content, 'html.parser')
    for div in soup.find_all('div', class_='translations_sidebar'):
        for ul in soup.find_all('ul'):
            for li in ul.find_all('li'):
                a = li.find('a')
                bilingual_dict[url] = a['href']
    print(bilingual_dict)
print(bilingual_dict)
The problem is that the inner loops search the whole soup rather than the translations div, so bilingual_dict[url] is overwritten with every link on the page and ends up holding the last one. The translation link can instead be targeted directly through its hreflang attribute. This script will print the link to the English translation:
import requests
from bs4 import BeautifulSoup
url = 'https://mn.usembassy.gov/mn/2020-naadam-mn/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
link = soup.select_one('a[hreflang="en"]')
print(link['href'])
Prints:
https://mn.usembassy.gov/2020-naadam/
Complete code (where there is no link to an English translation, the value is set to None):
import requests
from bs4 import BeautifulSoup
from pprint import pprint

url = 'https://mn.usembassy.gov/mn/news-events-mn/page/{page}/'

urls = []
for page in range(1, 53):
    print('Page {}...'.format(page))
    soup = BeautifulSoup(requests.get(url.format(page=page)).content, 'html.parser')
    for h in soup.find_all('h2'):
        a = h.find('a')
        urls.append(a.attrs['href'])
pprint(urls)

bilingual_dict = {}
for url in urls:
    print(url)
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    link = soup.select_one('a[hreflang="en"]')
    bilingual_dict[url] = link['href'] if link else None
pprint(bilingual_dict)
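Fetching 52 index pages plus every article page adds up to a lot of HTTP requests. One optional refinement - a sketch reusing the same parsing logic and the urls list from above - is to keep a single connection alive with requests.Session:

import requests
from bs4 import BeautifulSoup

session = requests.Session()    # reuses the underlying TCP connection across requests

bilingual_dict = {}
for url in urls:                # urls collected as in the script above
    soup = BeautifulSoup(session.get(url).content, 'html.parser')
    link = soup.select_one('a[hreflang="en"]')
    bilingual_dict[url] = link['href'] if link else None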

How to extract href links from anchor tags using BeautifulSoup?

I've been trying to extract just the links corresponding to the jobs on each page, but for some reason they don't print when I execute the script. No errors occur.
For the inputs I put engineering and toronto respectively. Here is my code:
import requests
from bs4 import BeautifulSoup
import webbrowser

jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
all_job_url = []
for tag in prettify.find_all('div', {'data-tn-element': "jobTitle"}):
    for links in tag.find_all('a'):
        print(links['href'])
You should be looking for the anchor a tag - the data-tn-element attribute sits on the anchor itself, not on a div. It looks like this:
<a class="turnstileLink" data-tn-element="jobTitle" href="/rc/clk?jk=3611ac98c0167102&fccid=459dce363200e1be" ...>Project <b>Engineer</b></a>
Call soup.find_all and iterate over the result set, extracting the links through the href attribute.
import requests
from bs4 import BeautifulSoup

# valid query; replace with something else as needed
url = "https://ca.indeed.com/jobs?q=engineer&l=Calgary%2C+AB"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

all_job_url = []
for tag in soup.find_all('a', {'data-tn-element': "jobTitle"}):
    all_job_url.append(tag['href'])
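Note that the example href above is relative (/rc/clk?...). If you need absolute, clickable URLs, one option - a minimal sketch assuming the all_job_url list from the snippet above - is to resolve each link against the search-page URL with urljoin:

from urllib.parse import urljoin

# turn the relative job links into absolute ones
absolute_urls = [urljoin(url, href) for href in all_job_url]
print(absolute_urls)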

How do I create a list from a sitemap.xml file to extract the URLs in Python?

I need to write code that finds a specific word inside scraped image links.
To explain: starting from a sitemap.xml page, my code must visit every link present in the XML file and check whether a specific word appears inside an image link on each page.
The sitemap is the adidas one: http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml
This is the code I created to find images whose link contains the word "zoom":
import requests
from bs4 import BeautifulSoup

html = requests.get('http://www.adidas.it/scarpe-superstar/C77124.html').text
bs = BeautifulSoup(html, 'html.parser')
possible_links = bs.find_all('img')
for link in possible_links:
    if link.has_attr('src'):
        if 'zoom' in link['src']:
            print(link['src'])
But I am looking for a method to scrape the whole list automatically.
This is what I tried to get the list:
from bs4 import BeautifulSoup
import requests

url = "http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, 'lxml')
for url in soup.findAll("loc"):
    print(url.text)
But I can't connect the two requests: I want to find the word "zoom" in every link present in sitemap.xml.
Thank you so much.
import requests
from bs4 import BeautifulSoup
import re

def make_soup(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup

# put the sitemap urls in a list
def get_xml_urls(soup):
    urls = [loc.string for loc in soup.find_all('loc')]
    return urls

# get the img urls whose src matches the given string
def get_src_contain_str(soup, string):
    srcs = [img['src'] for img in soup.find_all('img', src=re.compile(string))]
    return srcs

if __name__ == '__main__':
    xml = 'http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml'
    soup = make_soup(xml)
    urls = get_xml_urls(soup)
    # loop through the urls
    for url in urls:
        url_soup = make_soup(url)
        srcs = get_src_contain_str(url_soup, 'zoom')
        print(srcs)
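One caveat: a product sitemap like this can contain thousands of URLs, so the loop above fires a lot of requests. A hedged refinement of that loop - assuming the same make_soup and get_src_contain_str helpers - adds basic error handling and a pause between requests:

import time

for url in urls:
    try:
        url_soup = make_soup(url)
    except requests.RequestException:
        continue                        # skip pages that fail to load
    srcs = get_src_contain_str(url_soup, 'zoom')
    if srcs:
        print(url, srcs)
    time.sleep(1)                       # be polite to the server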
