How to get rid of duplicate links using python - python

I'm new to coding and I'm trying to scrape all unique web links from https://www.census.gov/programs-surveys/popest.html. I've tried to put the links into a set but the output comes back as {'/'}. I don't know any other way to get rid of duplicates. Below is my code. Thank you for your help.
from bs4 import BeautifulSoup
import urllib
import urllib.request
import requests

# Fetch and parse the page once.
with urllib.request.urlopen('https://www.census.gov/programs-surveys/popest.html') as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# BUG in the original: `set(link)` ran once, after the loop, on the *last*
# href string — producing a set of its characters (hence {'/'}).  Instead,
# declare the set up front and add each matching href inside the loop; a set
# silently drops duplicates.
unique_links = set()
for anchor in soup.find_all('a', href=True):
    href = str(anchor.get('href'))
    if href.startswith('https') or href.endswith('html'):
        print(href)
        unique_links.add(href)
print(unique_links)

Let's say all the links are stored in a list called links1. Here is how you can remove duplicates without the use of set():
# Deduplicate `links1` while preserving order, without using set().
links2 = []
for link in links1:
    # fixed typo: the original tested `link2`, an undefined name (NameError)
    if link not in links2:
        links2.append(link)

Your set only contains the final link, declare the set() earlier, then add to it.
# Declare the set before the loop so every iteration adds to the same set.
unique_links = set()
for anchor in soup.find_all('a', href=True):
    href = str(anchor.get('href'))
    # Print any href that is absolute (https...) or points at an html page.
    if href.startswith('https') or href.endswith('html'):
        print(href)
    unique_links.add(href)
print(unique_links)

Create the set outside the for loop, then add to set inside the loop.
# Create the set outside the for loop, then add each href inside the loop.
link_set = set()
for link in soup.find_all('a', href=True):
    link_set.add(link['href'])  # fixed: the original was missing the closing parenthesis

Related

How to select all links of apps from app store and extract its href?

from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen

# Plain string literal: the original used an f-string with no placeholders.
url = 'https://www.apple.com/kr/search/youtube?src=globalnav'
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html.parser')
# NOTE(review): the served markup differs from what the browser dev tools
# show, so this selector matches nothing and `links` comes back empty.
links = soup.select(".rf-serp-productname-list")
print(links)
I want to crawl through all links of shown apps. When I searched for a keyword, I thought links = soup.select(".rf-serp-productname-list") would work, but links list is empty.
What should I do?
Just check this code; I think it is what you want:
import re
import requests
from bs4 import BeautifulSoup

# Every internal link discovered so far, shared across recursive calls.
pages = set()

def get_links(page_url):
    """Recursively visit and print every site-internal ("/...") link reachable from page_url."""
    global pages
    internal_href = re.compile("^(/)")
    html = requests.get(f"your_URL{page_url}").text  # fstrings require Python 3.6+
    soup = BeautifulSoup(html, "html.parser")
    for anchor in soup.find_all("a", href=internal_href):
        # Only follow anchors that carry an href we haven't seen yet.
        if "href" in anchor.attrs and anchor.attrs["href"] not in pages:
            found = anchor.attrs["href"]
            print(found)
            pages.add(found)
            get_links(found)

get_links("")
Source:
https://gist.github.com/AO8/f721b6736c8a4805e99e377e72d3edbf
You can change the part:
for link in soup.find_all("a", href=pattern):
#do something
To check for a keyword I think
You are cooking a soup, so first of all taste it and check that everything you expect is contained in it.
The ResultSet of your selection is empty because the structure in the response differs a bit from the one you expected based on the developer tools.
To get the list of links select more specific:
links = [a.get('href') for a in soup.select('a.icon')]
Output:
['https://apps.apple.com/kr/app/youtube/id544007664', 'https://apps.apple.com/kr/app/%EC%BF%A0%ED%8C%A1%ED%94%8C%EB%A0%88%EC%9D%B4/id1536885649', 'https://apps.apple.com/kr/app/youtube-music/id1017492454', 'https://apps.apple.com/kr/app/instagram/id389801252', 'https://apps.apple.com/kr/app/youtube-kids/id936971630', 'https://apps.apple.com/kr/app/youtube-studio/id888530356', 'https://apps.apple.com/kr/app/google-chrome/id535886823', 'https://apps.apple.com/kr/app/tiktok-%ED%8B%B1%ED%86%A1/id1235601864', 'https://apps.apple.com/kr/app/google/id284815942']

Scraping all URLs from search result page BeautifulSoup

I'm trying to get 100 URLs from the following search result page:
https://www.willhaben.at/iad/kaufen-und-verkaufen/marktplatz/fahrraeder-radsport/fahrraeder-4552?rows=100&areaId=900
Here's the test code I have:
import requests
from bs4 import BeautifulSoup

urls = []

def get_urls(url):
    """Collect absolute listing URLs from a willhaben result page into `urls` and print them."""
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    # The "header w-brk" class sits on a <div>, not on the <a> itself, so the
    # original soup.find('a', class_="header w-brk") always returned None.
    for div in soup.find_all('div', class_="header w-brk"):
        anchor = div.find('a')
        if anchor is not None and anchor.get('href'):
            # hrefs are site-relative ("/iad/..."); prepend the root so the
            # collected URLs can be fetched directly later.
            urls.append("https://www.willhaben.at" + anchor['href'])
    print(urls)
Unfortunately the list returns [None]. I've also tried using href=True in the soup.find or soup.find_all method but unfortunately that doesn't work either. I can see another problem with this:
The URL the page provides in the source is for example:
a href="/iad/kaufen-und-verkaufen/d/fahrrad-429985104/"
just the end of the willhaben.at URL. When I do get all of these URLs appended to my list, I won't be able to scrape them just like they are, I'll have to somehow add the root URL to it before my scraper can load it.
What is the most effective way I can solve this?
Thanks!
You can choose many ways to get anchor URLs.
soup.select elegant way:
urls.extend([a.attrs['href'] for a in soup.select('div.header.w-brk a')])
soup.select simpler way:
for a in soup.select('div.header.w-brk a'):
urls.append(a.attrs['href'])
soup.find_all simpler way:
for div in soup.find_all('div', class_="header w-brk"):
urls.append(div.find('a').attrs['href'])
soup.find_all elegant way:
urls.extend([div.find('a').attrs['href'] for div in soup.find_all('div', class_="header w-brk")])
Checkout:
import requests
from bs4 import BeautifulSoup

urls = []
url = "https://www.willhaben.at/iad/kaufen-und-verkaufen/marktplatz/fahrraeder-radsport/fahrraeder-4552?rows=100&areaId=900"

def get_urls(url):
    """Append the absolute URL of every result on the page to `urls`, then print the list."""
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    for container in soup.find_all("div", {"class": "w-brk"}):
        anchor = container.find("a")
        # hrefs are relative, so prefix the site root.
        urls.append("https://www.willhaben.at" + anchor['href'])
    print(urls)

get_urls(url)
For the second part of your question you could use a simple list comprehension:
urls_with_base = [f"{base_url}/{url}" for url in urls]
This is the code you are looking for. I hope that you do not need any explanations for this code:
import requests
from bs4 import BeautifulSoup

urls = []

def get_urls(page_url):
    """Replace the module-level `urls` list with every href found on page_url."""
    global urls
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")
    collected = []
    for tag in soup.find_all("a", href=True):
        collected.append(tag.get("href"))
    urls = collected

how to fix the def to return the links

I located some links on a web site with beautifullSoup and need to return them in a list(or txt file) to use them later on.
It's to get some text from the links on the sites they lead to. I tried to make a def to return the links but I'm not smart enough to get the def working.
for link in soup.find_all('a', href=True):
print(link["href"])
I get a list of links from the code above and could make it write into a text file (by myself) and make a new python script but I would rather prefer to "return" it to continue the script and by the way learn something.
I came up with this, but it doesn't work:
def linkgetter(soup):
    # NOTE(review): this is the broken version from the question, kept as-is.
    # soup.find() returns only the *first* matching tag (iterating a tag walks
    # its children), and `return soup` exits on the first iteration with the
    # whole parsed page — which is why the entire HTML is returned instead of
    # a list of links.
    for link in soup.find('a', href=True):
        return soup
it prints out the whole site's html code and doesn't filter the links.
def get_links(soup):
    """Return the href attribute of every anchor tag that has one."""
    hrefs = []
    for anchor in soup.find_all('a', href=True):
        hrefs.append(anchor["href"])
    return hrefs
You can try the following:
from bs4 import BeautifulSoup
import urllib.request
import re

def parse_links(url):
    """Fetch *url* and return a list with the href of every anchor (None when absent)."""
    links = []
    # Modernized: the original used Python 2's urllib2 and a `print` statement,
    # inconsistent with the Python 3 code elsewhere on this page.
    with urllib.request.urlopen(url) as response:
        # Name the parser explicitly; bare BeautifulSoup(html) emits a warning.
        soup = BeautifulSoup(response.read(), 'html.parser')
    for link in soup.findAll('a'):
        links.append(link.get('href'))
    return links

print(parse_links("https://stackoverflow.com/questions/57826906/how-to-fix-the-def-to-return-the-links#57826972"))
If you would like to get the links starting with http://, you can use:
soup.findAll('a', attrs={'href': re.compile("^http://")})

Getting all Links from a page Beautiful Soup

I am using beautifulsoup to get all the links from a page. My code is:
import requests
from bs4 import BeautifulSoup
url = 'http://www.acontecaeventos.com.br/marketing-promocional-sao-paulo'
r = requests.get(url)
html_content = r.text
soup = BeautifulSoup(html_content, 'lxml')
# NOTE(review): this is the question's bug — find_all('href') searches for
# <href> *tags*, which don't exist; href is an attribute of <a> tags, so
# this expression evaluates to [].
soup.find_all('href')
All that I get is:
[]
How can I get a list of all the href links on that page?
You are telling the find_all method to find href tags, not attributes.
You need to find the <a> tags, they're used to represent link elements.
links = soup.find_all('a')
Later you can access their href attributes like this:
link = links[0] # get the first link in the entire page
url = link['href'] # get value of the href attribute
url = link.get('href') # or like this
Replace your last line:
links = soup.find_all('a')
By that line :
links = [a.get('href') for a in soup.find_all('a', href=True)]
It will scrap all the a tags, and for each a tags, it will append the href attribute to the links list.
If you want to know more about the for loop between the [], read about List comprehensions.
To get a list of every href regardless of tag, use:
# Any tag can carry an href; filter on the attribute instead of the tag name.
href_tags = soup.find_all(href=True)
hrefs = []
for tag in href_tags:
    hrefs.append(tag.get('href'))

How to solve, finding two of each link (Beautifulsoup, python)

I'm using beautifulsoup4 to parse a webpage and collect all the href values using this code:
# Collect links from 'new' page
pageRequest = requests.get('http://www.supremenewyork.com/shop/all/shirts')
soup = BeautifulSoup(pageRequest.content, "html.parser")
links = soup.select("div.turbolink_scroller a")
allProductInfo = soup.find_all("a", class_="name-link")
# fixed: `print allProductInfo` was a Python 2 print statement — a SyntaxError
# under the Python 3 used everywhere else on this page.
print(allProductInfo)
linksList1 = []
for href in allProductInfo:
    linksList1.append(href.get('href'))
print(linksList1)
linksList1 prints two of each link. I believe this is happening because it is taking the link from the title as well as from the item colour. I have tried a few things but cannot get BeautifulSoup to parse only the title link and produce a list with one of each link instead of two. I imagine it's something really simple that I'm missing. Thanks in advance.
This code will give you the result without getting duplicate results
(also using set() may be a good idea as #Tarum Gupta)
But I changed the way you crawl
import requests
from bs4 import BeautifulSoup

# Collect links from 'new' page
pageRequest = requests.get('http://www.supremenewyork.com/shop/all/shirts')
soup = BeautifulSoup(pageRequest.content, "html.parser")
links = soup.select("div.turbolink_scroller a")
# Gets all divs with class of inner-article then search for a with name-link class
# that is inside an h1 tag
# (fixed: the continuation line above had lost its leading '#', a SyntaxError)
allProductInfo = soup.select("div.inner-article h1 a.name-link")
# print (allProductInfo)
linksList1 = []
for href in allProductInfo:
    linksList1.append(href.get('href'))
print(linksList1)
alldiv = soup.findAll("div", {"class": "inner-article"})
for div in alldiv:
    linksList1.append(div.h1.a['href'])  # fixed typo: the original appended to undefined `linkList1`
# set() removes duplicates; assign the result back — the original evaluated
# set(...) and list(...) as bare expressions and discarded both results.
linksList1 = list(set(linksList1))

Categories