How to extract links from HTML using BeautifulSoup?

How to extract links from HTML using BeautifulSoup? - python

I am trying to use a regex function in Python to filter links out of the HTML that I scraped from the eBay website.
My question is: how can I keep only the links that match the following pattern: https://www.ebay.com/itm/ + all other characters?
I successfully get the https://www.ebay.com/itm/ part, but I am not sure how to capture the rest.
Python version that I am using: 3.8.8.
Here is the code:
from bs4 import BeautifulSoup
import requests
import re
url = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=Universal+Adjustable+Hand+Shower+Holder+Suction+Cup+Holder+Full+Plating+Shower+Rail+Head+Holder+Bathroom+Bracket+Stable+rotation&_sacat=0'
r = requests.get(url)
soup = BeautifulSoup(r.content, features="lxml")
listings = soup.find_all('li')
pattern = 'https://www.ebay.com/itm/'
results = re.findall('https://www.ebay.com/itm/', str(listings))
print(results)

To get links that start with https://www.ebay.com/itm/ you can do:
import requests
from bs4 import BeautifulSoup
url = "https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=Universal+Adjustable+Hand+Shower+Holder+Suction+Cup+Holder+Full+Plating+Shower+Rail+Head+Holder+Bathroom+Bracket+Stable+rotation&_sacat=0"
r = requests.get(url)
soup = BeautifulSoup(r.content, features="lxml")
# Select only anchors inside <li> elements that actually carry an href, so the
# attribute lookup below cannot raise KeyError on a bare <a>.
listings = soup.select("li a[href]")
for a in listings:
    link = a["href"]
    # Keep only links that begin with the item-page prefix.
    if link.startswith("https://www.ebay.com/itm/"):
        print(link)
Prints:
https://www.ebay.com/itm/01920391?epid=26039819083&_trkparms=ispr%3D1&hash=item3b542eae7a:g:FQkAAOSwK21gKvEZ&amdata=enc%3AAQAFAAACcBaobrjLl8XobRIiIML1V4Imu%252Fn%252BzU5L90Z278x5ickkrDx%252B2NLp21dg6hHbHAkGMYdiW1E6zjXxnQ0bf7c%252Fx%252Fvs5PW%252FYFw1ZdbGMi8wsGV6qXw8OFLl4Os1ACX3bnQxFkVpRib9hMb5gVyLha4q9L0xiporu5InbX0LrSgg7nCCCwtC7y3vOE3hc8PszsrXWLb5KFdj7%252BD98et12MdkEfMPFhJZuS%252BkFsp2esVTRCYctOhcwzPSdfzCOYprlr2miQc4czCv1Tcfs3LKUPJn8uQyRc%252BAnKY1oyTeYnJ7wYuGkBU%252FSVYjziLBaPhT%252FlVu0hR9ZX6OnAeRaJ1g0iCaDjrRXEXRwUO87riWeI8kExm1zzY7QicPeMnfWZdBvVhg05GOScPOlLTVPHakqGLX0y2GUXV6fkTLua3nSF5YBmLX%252FqdCxT6yS0dutVs5MPWvQYlN474hUzbubkZVAs7Y%252BBBEsHrGjVzCj0szZ6w1%252BHgkV5O9jrXGnyew5%252Bnxy7VCq5xEkUDIt1nSg996AeDksNmSNumhfsIOGltIXbqAbjqEUpPcVO%252BDPymxlh0iMxCZQalYnmljBRzKILYWkES0vfA14Gh5E7KWrztdC6WzEEFtgVuABakQ1eAOZnuEueqK6IakC%252BIfRbXv96Tv01IPDvwPeM8wMo6j8bMjY3D5KHS5EXPVdHKUnjCJiYCcVUqcKwhL6eN2MZ%252Bn9yxmWESUPN394NPrX%252FI2z7t0Bbo7iqmsWNQcyi0EHzDwJPMK%252FNSif8%252F2adRF7dT1JrbL9sryKSN2kv9OsdGQ0fMMC1LV3Ph43HivUJdqkgjGxqEqX5v1xQ%253D%253D%7Ccksum%3A25481541593068896952f4834d93a0bb998f5b5ba5fe%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/333909214974?_trkparms=ispr%3D1&hash=item4dbe8a46fe:g:wjYAAOSwXZtgQKMK&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsSprd4IgPv6LBzlteBBg9Pe7gpGas6KEYWvIQmgmNrIcf1ZIcLwSyYlVIojBTlJPAd4XoDj5k2AzULwOu2K1678su8RhEOIAhD%252FWN3J3erMgWugg2CT0y8zcUjbbWHtNfzYrnVuG%252FUvWmYzvxdGvHZvRdzha0juBNJzGKXnChJ6MECxb2uzy5G631xzrIxblEW3AM1ogDVv19il%252Fn7Vlzr0cERgdjLqGUj5KIynU8Xj9sHRx%252F%252FcSxPq8W4v1RHP%252FYZQhE4keU2U3pZGBCy9hyaYaf5pr78zEcjg2aoyh%252B6DUFkxtjTnmPhNqiwO5QNkl21UaDagoNQYRkZb4iW0QpXX628qHxIqBVL4GjtRczXt%252Bi0ZVn0B6GBqNzHtHarXqCSe7966rspJOMXwYB%252Fj3m%252Fpe3oJO8dKBNDmzxUNhnE4Bf76ZlFiW%252Fh9TXWEO3vdhLjRjjmjJKjgj7IXVorj%252BypBfHCbTXCA7nHsBSsOdNR%252B3LopnwoQutVCsk%252FQ8bESJCn0unvLARcM6wBRakZl3Uxhe1iSnMV%252F69vpTVm2XC1ZjU0IzF7xB5laYPRPcrOwXSJHwZJHkwyziteo%252FWBZNkklgTjY%252BdEanyHKlzppyJWLAufvIbWt4YWy3Xz%252FAKaKScpMGsHWNqEKwGJ%252BPeSFwq57qnm5xK0InzfsHtyVHd6lWoaKqyP8e7nVQBjqfNciDdLD5D3fuiIHQupPyX2m1Z%252B%252FeQOayocP6GV%252BAfSHCS44Y%252Fgh4njFjCVO5KAR%252BZXiNddo%252BfFGSpUZO1q38Cq3MDvyeRF3vYd2r3FNaimwBT2d3%252Fnl3seechJuFBWHONudGgMuSiXwoy66SbYwIj0CD427%252B1FVDHw%252F4LSkJhi78oWKX2I9sGnvHrj4boKie6mlPN6ZGOenVXOpLWNK8ZSN%252FzEqUzt6iC32EFb8rj%252FC4HK9AWddRejsdJHnsFKs7QAysEUN%252Fh6ltSA3FR64qww0Qvp3A%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/333909214974?_trkparms=ispr%3D1&hash=item4dbe8a46fe:g:wjYAAOSwXZtgQKMK&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsSprd4IgPv6LBzlteBBg9Pe7gpGas6KEYWvIQmgmNrIcf1ZIcLwSyYlVIojBTlJPAd4XoDj5k2AzULwOu2K1678su8RhEOIAhD%252FWN3J3erMgWugg2CT0y8zcUjbbWHtNfzYrnVuG%252FUvWmYzvxdGvHZvRdzha0juBNJzGKXnChJ6MECxb2uzy5G631xzrIxblEW3AM1ogDVv19il%252Fn7Vlzr0cERgdjLqGUj5KIynU8Xj9sHRx%252F%252FcSxPq8W4v1RHP%252FYZQhE4keU2U3pZGBCy9hyaYaf5pr78zEcjg2aoyh%252B6DUFkxtjTnmPhNqiwO5QNkl21UaDagoNQYRkZb4iW0QpXX628qHxIqBVL4GjtRczXt%252Bi0ZVn0B6GBqNzHtHarXqCSe7966rspJOMXwYB%252Fj3m%252Fpe3oJO8dKBNDmzxUNhnE4Bf76ZlFiW%252Fh9TXWEO3vdhLjRjjmjJKjgj7IXVorj%252BypBfHCbTXCA7nHsBSsOdNR%252B3LopnwoQutVCsk%252FQ8bESJCn0unvLARcM6wBRakZl3Uxhe1iSnMV%252F69vpTVm2XC1ZjU0IzF7xB5laYPRPcrOwXSJHwZJHkwyziteo%252FWBZNkklgTjY%252BdEanyHKlzppyJWLAufvIbWt4YWy3Xz%252FAKaKScpMGsHWNqEKwGJ%252BPeSFwq57qnm5xK0InzfsHtyVHd6lWoaKqyP8e7nVQBjqfNciDdLD5D3fuiIHQupPyX2m1Z%252B%252FeQOayocP6GV%252BAfSHCS44Y%252Fgh4njFjCVO5KAR%252BZXiNddo%252BfFGSpUZO1q38Cq3MDvyeRF3vYd2r3FNaimwBT2d3%252Fnl3seechJuFBWHONudGgMuSiXwoy66SbYwIj0CD427%252B1FVDHw%252F4LSkJhi78oWKX2I9sGnvHrj4boKie6mlPN6ZGOenVXOpLWNK8ZSN%252FzEqUzt6iC32EFb8rj%252FC4HK9AWddRejsdJHnsFKs7QAysEUN%252Fh6ltSA3FR64qww0Qvp3A%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/313269877281?hash=item48f056fa21:g:hd0AAOSwz6tfknZF
https://www.ebay.com/itm/313269877281?hash=item48f056fa21:g:hd0AAOSwz6tfknZF
https://www.ebay.com/itm/184741430233?_trkparms=ispr%3D1&hash=item2b037283d9:g:TyQAAOSwVYBgZxlp&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsS7UXID%252BRPOSNsnm8kYPghtOhpdocScHh%252BDgY2CTi3nJTgQcXQEKAs8Ix13gdusXFa3yngmHdTBh2vatEHVnazTxou3w%252FDC26a237oG00lpNN0M8zpUGIN9nYQpAgSrl8Z2cNTqgManH1%252Bz3LIm4YNAlHJk%252BHZA8%252FzFNts2J8C7tWjwB%252FA16vRrvHG7nYYLh0tdLtgbf0dpc2Qe9fAJlzaEBwz89Vmv30NMzTFBw4HT%252FzDvPHrTnvmygxCs75Law6GL0yq5FZiufYo%252Fj1bGviIrCjul2tV%252FqMzojlafc7De3fEH0%252Fx7e%252B3S%252BRPOUwrq%252BjDlOH8%252Bp7tZBwZd70GEYnMNBPUFcFmnDcsZgJhS4xnlt5ibp7JgMABsL7TnNK6pm0ran5wu7KVyRsCnTEkWCx0WZ9emEe1xzzVQrJt%252B7ICl%252FNqMq9gaz3%252F2O%252B4Zf83FlMWgd2K0kvtYiBYlFzRZTKrLIkJUl9pDo8nYDJROrSUnT3xZdDu38mqKft9ckuwd8LsRn8emIR%252BJJQi2I6M1gWYnuPrD09rQQMKD7FA2VxWfA4JfePSSoAwyFVyvOkYN47cJkG3ymeQJZUZOFXt8h2j%252F3b0KlMlImQcYcancrGMk7cx4AfUBca%252BSPEC8e3w2RIDzp62%252F9If%252FVLm6Vq6rvdjjspmxA9r2LErZAgEIu6%252BXWuF3eicCrtX9dQic3TmmPPeordO3nK2QKgdSc487ywrfOX9i5XmQqmFmOyN5W%252B%252FwDmeGHmb1l1KPnWa14048eIHqDITTFibsKyRk7H04VwWrkqhfOxsrek7VxmnK7Ciqgli22Se7VPqPhlVRddgYe%252FIvp5Yw8%252BGoewf4mpOiRQ%252Bmm5QpRVpA%252BusDd2id6Gz2crCd%252FnoYT9shk8tfXCfGwFson3CxpgjeSoLd1tsQ%252FeY0qgZelpVjKdZ%252BeaaJpiC1uKa4r5FZxxJ0b5VxYp8j6F9255HFQGZzAQ2MVJyu647sA%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/184741430233?_trkparms=ispr%3D1&hash=item2b037283d9:g:TyQAAOSwVYBgZxlp&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsS7UXID%252BRPOSNsnm8kYPghtOhpdocScHh%252BDgY2CTi3nJTgQcXQEKAs8Ix13gdusXFa3yngmHdTBh2vatEHVnazTxou3w%252FDC26a237oG00lpNN0M8zpUGIN9nYQpAgSrl8Z2cNTqgManH1%252Bz3LIm4YNAlHJk%252BHZA8%252FzFNts2J8C7tWjwB%252FA16vRrvHG7nYYLh0tdLtgbf0dpc2Qe9fAJlzaEBwz89Vmv30NMzTFBw4HT%252FzDvPHrTnvmygxCs75Law6GL0yq5FZiufYo%252Fj1bGviIrCjul2tV%252FqMzojlafc7De3fEH0%252Fx7e%252B3S%252BRPOUwrq%252BjDlOH8%252Bp7tZBwZd70GEYnMNBPUFcFmnDcsZgJhS4xnlt5ibp7JgMABsL7TnNK6pm0ran5wu7KVyRsCnTEkWCx0WZ9emEe1xzzVQrJt%252B7ICl%252FNqMq9gaz3%252F2O%252B4Zf83FlMWgd2K0kvtYiBYlFzRZTKrLIkJUl9pDo8nYDJROrSUnT3xZdDu38mqKft9ckuwd8LsRn8emIR%252BJJQi2I6M1gWYnuPrD09rQQMKD7FA2VxWfA4JfePSSoAwyFVyvOkYN47cJkG3ymeQJZUZOFXt8h2j%252F3b0KlMlImQcYcancrGMk7cx4AfUBca%252BSPEC8e3w2RIDzp62%252F9If%252FVLm6Vq6rvdjjspmxA9r2LErZAgEIu6%252BXWuF3eicCrtX9dQic3TmmPPeordO3nK2QKgdSc487ywrfOX9i5XmQqmFmOyN5W%252B%252FwDmeGHmb1l1KPnWa14048eIHqDITTFibsKyRk7H04VwWrkqhfOxsrek7VxmnK7Ciqgli22Se7VPqPhlVRddgYe%252FIvp5Yw8%252BGoewf4mpOiRQ%252Bmm5QpRVpA%252BusDd2id6Gz2crCd%252FnoYT9shk8tfXCfGwFson3CxpgjeSoLd1tsQ%252FeY0qgZelpVjKdZ%252BeaaJpiC1uKa4r5FZxxJ0b5VxYp8j6F9255HFQGZzAQ2MVJyu647sA%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
https://www.ebay.com/itm/154108126132?_trkparms=ispr%3D1&hash=item23e18f63b4:g:5g0AAOSwkApfch0l&amdata=enc%3AAQAGAAADAPYe5NmHp%252B2JMhMi7yxGiTJkPrKr5t53CooMSQt2orsStEKTPzZMfQmny3knR97t0JsD6L97oauzovvk2KO0WEt%252Fy1DvPmlzbWssMnX7EK9TVYjaGDcIoX%252FGC%252FmDBJiZuzo4Q57rySuBn9egod2B14d0XvKYJrNvSErcJ2RIJIQABAo473RTmNDf6Ql8ivp1PqCbAFg3a3CFzJJNK1in5oOpyIPlur6D%252BrPTA0SbM8%252F6ryLrpO9VpOBncBnX3aQFMBkALPsmmbolhK1Z2wtACsanrnaWudSur0%252BWE2VNOx8K2DSdSW7FqtEDE0lE%252F%252FUhnHvrdqsLg%252F1GsgbpoyWNV1TSap52SOr0Ndsb6HhxIaOdngXwWmW%252BD4qbhHsUxFesHcMrPsOtlb7gIRYj5ubReQpvgK9GiBSCwiEBMUcODZoXhNEsXp9MiBZF9qeWKTApnNy3pbBWAI65p3v0TiawoyWN%252BGywOpl8laWKrZseCWQMn4o0ZmSC08wU%252Fj7Zn2biBdULHfRiJm%252FPvDzSeAVy92JchTe69dUO9%252BHL7%252B5zurnYfPEjLiOaKhHW60bbsO3ru3PZ1RMwF4iMxP%252FgGIKc8xk11Zsms1%252BGfiApbCy7rQraZPQBzQJpzQOIzd7xrkPyOBuNjj2HdPE7bZjLGOCjuIYDIuBxCP0ZMajblQrr5pzvUjdMT4zJ76GmJ28qGDm5TkxD8tDv0eaWnBfDssMpHtAW8GND7g3hE%252Fhqvoc2072E78fzVz0Ts51H%252BW77rrbpC9DmtLHopXEp5fCMcP3vuTv%252BaCqKuaMuy8blc6S6ldjbuqSkRe5qtI5edG5HeingADQenYM%252FbViMlpVwv1ul9GZquueAo89zMOUKKRhsfq6oP4LyLN6%252BNUcOoAxOSBMw7bWC7oYD5yilolDFw7RauJooyv3wMOuZLCABtjABogDu63sdJpcNptiYVj84nsGwDi4AJ4uTPjw1jItB87NskmHFAyY2sdHH%252Be3OqwybMZKg7OKXzx2WPSDPVdN2K5TjZA%253D%253D%7Campid%3APL_CLK%7Cclp%3A2334524
...and so on.

You could filter more efficiently within the CSS selector itself by using the ^= (starts-with) attribute operator to match only the links that begin with that string. Use a set comprehension to return only unique items.
from bs4 import BeautifulSoup
import requests
url = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=Universal+Adjustable+Hand+Shower+Holder+Suction+Cup+Holder+Full+Plating+Shower+Rail+Head+Holder+Bathroom+Bracket+Stable+rotation&_sacat=0'
r = requests.get(url)
soup = BeautifulSoup(r.content, features="lxml")
# The [href^="..."] attribute selector matches only elements whose href starts
# with the item URL prefix; collecting into a set drops duplicate links.
links = {i['href'] for i in soup.select('[href^="https://www.ebay.com/itm/"]')}
print(links)
If you wish to specify the href is a descendant of a li then add that into the selector with a descendant combinator and type selector:
links = {i['href'] for i in soup.select('li [href^="https://www.ebay.com/itm/"]')}

Related

How to select all links of apps from app store and extract its href?

from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
url = f'https://www.apple.com/kr/search/youtube?src=globalnav'
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html.parser')
links = soup.select(".rf-serp-productname-list")
print(links)
I want to crawl through all links of shown apps. When I searched for a keyword, I thought links = soup.select(".rf-serp-productname-list") would work, but links list is empty.
What should I do?

Just check this code; I think it is what you want:
import re
import requests
from bs4 import BeautifulSoup
pages = set()  # every internal href discovered so far

# Matches hrefs that begin with "/"; compiled once instead of once per page.
_INTERNAL_HREF = re.compile("^(/)")


def _internal_hrefs(page_url):
    """Fetch one page and return the href of every anchor whose href starts with "/"."""
    # NOTE: "your_URL" is a placeholder -- presumably to be replaced with the
    # base URL of the site being crawled.
    html = requests.get(f"your_URL{page_url}").text  # fstrings require Python 3.6+
    soup = BeautifulSoup(html, "html.parser")
    return [link.attrs["href"] for link in soup.find_all("a", href=_INTERNAL_HREF)]


def get_links(page_url):
    """Crawl depth-first from page_url, printing each newly found internal link.

    Every new href is printed, recorded in the module-level ``pages`` set and
    then crawled in turn, in the same order a recursive walk would visit them.
    An explicit stack of iterators replaces recursion so that a long chain of
    pages cannot hit Python's recursion limit.

    Args:
        page_url: path appended to the base URL of the page to start from.
    """
    pending = [iter(_internal_hrefs(page_url))]
    while pending:
        for new_page in pending[-1]:
            if new_page not in pages:
                print(new_page)
                pages.add(new_page)
                # Descend into the new page before continuing with the
                # remaining links of the current one.
                pending.append(iter(_internal_hrefs(new_page)))
                break
        else:
            # Current page exhausted: resume its parent.
            pending.pop()


get_links("")
Source:
https://gist.github.com/AO8/f721b6736c8a4805e99e377e72d3edbf
You can change the part:
for link in soup.find_all("a", href=pattern):
#do something
so that it checks for a keyword, I think.

You are cooking a soup, so first of all taste it and check that it contains everything you expect.
The ResultSet of your selection is empty because the structure of the response differs a bit from the one you see in the developer tools.
To get the list of links select more specific:
links = [a.get('href') for a in soup.select('a.icon')]
Output:
['https://apps.apple.com/kr/app/youtube/id544007664', 'https://apps.apple.com/kr/app/%EC%BF%A0%ED%8C%A1%ED%94%8C%EB%A0%88%EC%9D%B4/id1536885649', 'https://apps.apple.com/kr/app/youtube-music/id1017492454', 'https://apps.apple.com/kr/app/instagram/id389801252', 'https://apps.apple.com/kr/app/youtube-kids/id936971630', 'https://apps.apple.com/kr/app/youtube-studio/id888530356', 'https://apps.apple.com/kr/app/google-chrome/id535886823', 'https://apps.apple.com/kr/app/tiktok-%ED%8B%B1%ED%86%A1/id1235601864', 'https://apps.apple.com/kr/app/google/id284815942']

How to extract url/links that are contents of a webpage with BeautifulSoup

So the website I am using is : https://keithgalli.github.io/web-scraping/webpage.html and I want to extract all the social media links on the webpage.
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
soup = bs(r.content)
links = soup.find_all('a', {'class':'socials'})
actual_links = [link['href'] for link in links]
I get an error, specifically:
KeyError: 'href'
For a different example and webpage, I was able to use the same code to extract the webpage link but for some reason this time it is not working and I don't know why.
I also tried to see what the problem was specifically and it appears that
links is a nested array where links[0] outputs the entire content of the ul tag that has class=socials, so it is not iterable, so to speak: the first element contains all the links rather than each social li tag being a separate element inside links.

Here is the solution using css selectors:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
soup = bs(r.content, 'lxml')
# Select every <a> nested in an <li> under the <ul class="socials"> list.
links = soup.select('ul.socials li a')
# Pull the href attribute out of each matched anchor.
actual_links = [link['href'] for link in links]
print(actual_links)
Output:
['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/#keithgalli']

Why not try something like:
import requests
from bs4 import BeautifulSoup as bs
# The URL must stay on a single line: a quoted string literal cannot span lines.
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
soup = bs(r.content)
links = soup.find_all('a', {'class':'socials'})
# Skip anchors without an href so the lookup cannot raise KeyError.
# Tag.has_attr is the bs4 API for this check.
actual_links = [link['href'] for link in links if link.has_attr('href')]
After gaining some new information from you and visiting the webpage, I've realized that you made the following mistake:
The socials class is never used in any a-element and thus you won't find any such in your script. Instead you should look for the li-elements with the class "social".
Thus your code should look like:
import requests
from bs4 import BeautifulSoup as bs
# The URL must stay on a single line: a quoted string literal cannot span lines.
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
soup = bs(r.content, "lxml")
# The class sits on the <li> elements, so find those first ...
link_list_items = soup.find_all('li', {'class':'social'})
# ... then read the href of the first anchor inside each item.
links = [item.find('a').get('href') for item in link_list_items]
print(links)

How do I exclude certain beautifulsoup results that I don't want?

I am having issues trying to exclude results given from my beautiful soup program this is my code:
from bs4 import BeautifulSoup
import requests
URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
for link in soup.find_all('a'):
print(link.get('href'))
I don't want to get the results that start with a "#" for example: #cite_ref-18
I have tried using for loops but I get this error message: KeyError: 0

You can use the str.startswith() method:
from bs4 import BeautifulSoup
import requests
URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
for tag in soup.find_all('a'):
    link = tag.get('href')
    # tag.get returns None when the anchor has no href; str() keeps the
    # startswith call safe in that case.
    if not str(link).startswith('#'):
        print(link)

You can use CSS selector a[href]:not([href^="#"]). This will select all <a> tags with href= attribute but not the ones starting with # character:
import requests
from bs4 import BeautifulSoup
URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# a[href] requires an href attribute; :not([href^="#"]) drops in-page anchors.
for link in soup.select('a[href]:not([href^="#"])'):
    print(link['href'])

Extracting links from html from the link of the following website

I want to extract the link
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=2&next=0&durationType=Y&Year=2018&duration=1&news_type=
from the html of the page
http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05
The following is the code that is used
url_list = "http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05"
html = requests.get(url_list)
soup = BeautifulSoup(html.text,'html.parser')
link = soup.find_all('a')
print(link)
I am using Beautiful Soup. How would I go about it? Using find_all('a') doesn't return the required link in the returned html.

Please try this to get Exact Url you want.
import bs4 as bs
import requests
import re
sauce = requests.get('https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=CHC&durationType=Y&Year=2018')
soup = bs.BeautifulSoup(sauce.text, 'html.parser')
# Anchors whose href contains "company_info"; of those, print only the ones
# that also contain "pageno".
for a in soup.find_all('a', href=re.compile("company_info")):
    if 'pageno' in a['href']:
        print(a['href'])
output:
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=2&next=0&durationType=Y&Year=2018&duration=1&news_type=
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=3&next=0&durationType=Y&Year=2018&duration=1&news_type=

You just have to use the get method to find the href attribute:
from bs4 import BeautifulSoup as soup
import requests
url_list = "http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05"
html = requests.get(url_list)
page= soup(html.text,'html.parser')
link = page.find_all('a')
for anchor in link:
    # .get returns None instead of raising when an anchor has no href.
    print(anchor.get('href'))

BeautifulSoup, findAll after findAll?

I'm pretty new to Python and mainly need it for getting information from websites.
Here I tried to get the short headlines from the bottom of the website, but can't quite get them.
from bfs4 import BeautifulSoup
import requests
url = "http://some-website"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
nachrichten = soup.findAll('ul', {'class':'list'})
Now I would need another findAll to get all the links/a from the var "nachrichten", but how can I do this ?

Use a css selector with select if you want all the links in a single list:
anchors = soup.select('ul.list a')
If you want individual lists:
# One list of anchors per matching <ul>: iterate over the lists and search each for 'a' tags.
anchors = [ul.find_all('a') for ul in soup.find_all('ul', {'class': 'list'})]
Also if you want the hrefs you can make sure you only find the anchors with href attributes and extract:
hrefs = [a["href"] for a in soup.select('ul.list a[href]')]
With find_all, set href=True, i.e. ul.find_all('a', href=True).

from bs4 import BeautifulSoup
import requests
url = "http://www.n-tv.de/ticker/"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
nachrichten = soup.findAll('ul', {'class':'list'})
links = []
# Run a second findAll on each matched list and pool all anchors together.
for ul in nachrichten:
    links.extend(ul.findAll('a'))
# print is a function in Python 3.
print(len(links))
Hope this solves your problem. I think the import should be bs4; I have never heard of bfs4.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to extract links from HTML using BeautifulSoup? - python

Related

How to select all links of apps from app store and extract its href?

How to extract url/links that are contents of a webpage with BeautifulSoup

How do I exclude certain beautifulsoup results that I don't want?

Extracting links from html from the link of the following website

BeautifulSoup, findAll after findAll?

Categories

Resources