Python 3: remove duplicate web links with an extra trailing character using rstrip - python

Using Python 3. I am trying to pull all the unique links from a website and seem to have the code working, except for a few links that have a / at the end.
For example: my program will include http://www.google.com & http://www.google.com/
I'd like to make sure my program removes that last character so that no duplicates are returned. I have researched rstrip() but I can't seem to get it to work. Here is my code:
import bs4 as bs
import urllib.request
import urllib.parse

source = urllib.request.urlopen('https://www.census.gov/data/tables/2016/demo/popest/state-total.html').read()
soup = bs.BeautifulSoup(source, 'lxml')

filename = "UniqueWebLinks.csv"
f = open(filename, "w")
headers = "WebLinks\n"
f.write(headers)

all_links = soup.find_all('a')
url_set = set()

for link in all_links:
    web_links = link.get("href")
    ab_url = urllib.parse.urljoin('https://www.census.gov/data/tables/2016/demo/popest/state-total.html', web_links)
    print(ab_url)
    if ab_url and ab_url not in url_set:
        f.write(str(ab_url) + "\n")
        url_set.add(ab_url)

I'd keep it simple and be very explicit about how you're cleaning URLs. For example, strip the last character if it's a slash (/) or a hash (#), since a URL ending in a bare hash is the same as one without it. After glancing at the data, I'd also remove any blank URLs, because those are probably not what you're looking for.
BASE_URL = 'https://www.census.gov/data/tables/2016/demo/popest/state-total.html'

all_links = soup.find_all('a')

def clean_links(tags, base_url):
    cleaned_links = set()
    for tag in tags:
        link = tag.get('href')
        if link is None:
            continue
        if link.endswith('/') or link.endswith('#'):
            link = link[:-1]  # drop the trailing slash or hash
        full_url = urllib.parse.urljoin(base_url, link)
        cleaned_links.add(full_url)
    return cleaned_links

cleaned_links = clean_links(all_links, BASE_URL)

for link in cleaned_links:
    f.write(str(link) + '\n')
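Since the question mentions rstrip(), the same cleanup could also be done with it inside clean_links; this is just a sketch of an alternative, not part of the answer above:
link = link.rstrip('/#')  # removes any trailing '/' or '#' characters in one call
Note that rstrip strips every trailing character from the given set, so a link ending in '//' would lose both slashes.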

Related

python - Retrieve and save links from webpage but only one per domain

I'm having a bit of trouble trying to save the links from a website into a list without repeating URLs with the same domain.
Example:
www.python.org/download and www.python.org/about
should only save the first one (www.python.org/download) and not repeat it later
This is what I've got so far:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
url = "https://docs.python.org/3/library/urllib.request.html#module-urllib.request"
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
atag = doc.find_all('a', href=True)
links = []
#below should be some kind of for loop
As a one-liner:
links = {nl for a in doc.find_all('a', href=True) if (nl := urlparse(a["href"]).netloc) != ""}
Explained:
links = set()                            # define an empty set
for a in doc.find_all('a', href=True):   # loop over every <a> element
    nl = urlparse(a["href"]).netloc      # get the netloc from the url
    if nl:
        links.add(nl)                    # add to the set if it is non-empty
output:
{'www.w3.org', 'datatracker.ietf.org', 'www.python.org', 'requests.readthedocs.io', 'github.com', 'www.sphinx-doc.org'}
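If you want to keep the first full link seen for each domain (as in the www.python.org/download example) rather than just the domain names, one possible sketch, reusing doc and urlparse from the question (the variable name first_link_per_domain is just illustrative):
first_link_per_domain = {}  # netloc -> first href seen for that domain
for a in doc.find_all('a', href=True):
    netloc = urlparse(a["href"]).netloc
    if netloc and netloc not in first_link_per_domain:
        first_link_per_domain[netloc] = a["href"]
links = list(first_link_per_domain.values())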

Why does my web scraper write everything into a single line

Complete newbie but I've managed to successfully scrape EAN numbers with Python from a list of links created by an upstream piece of code. However, my output file contains all the scraped numbers as a continuous single line instead of one EAN per line.
Here's my code - what's wrong with it? (scraped URL redacted)
import requests
from bs4 import BeautifulSoup
import urllib.request
import os
subpage = 1
while subpage <= 2:
URL = "https://..." + str(subpage)
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
"""writes all links under the h2 tag into a list"""
links = []
h2s = soup.find_all("h2")
for h2 in h2s:
links.append("http://www.xxxxxxxxxxx.com" + h2.a['href'])
"""opens links from list and extracts EAN number from underlying page"""
with open("temp.txt", "a") as output:
for link in links:
urllib.request.urlopen(link)
page_2 = requests.get(link)
soup_2 = BeautifulSoup(page_2.content, "html.parser")
if "EAN:" in soup_2.text:
span = soup_2.find(class_="articleData_ean")
EAN = span.a.text
output.write(EAN)
subpage += 1
os.replace('temp.txt', 'EANs.txt')
output.write(EAN) writes each EAN with nothing between them; it doesn't automatically add a separator or newline. You can add a newline with output.write('\n'), or a comma, etc., to separate them.
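For example, assuming the rest of the loop stays the same, appending the newline when writing gives one EAN per line:
output.write(EAN + "\n")  # newline after each EAN so the file has one value per line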

Removing duplicate URLs in Python including URLs that contain a forward slash

The following program is giving me output that includes URLs with and without the forward slash (e.g. ask.census.gov and ask.census.gov/). I need to eliminate one or the other. Thank you in advance for your help!
from bs4 import BeautifulSoup as mySoup
from urllib.parse import urljoin as myJoin
from urllib.request import urlopen as myRequest
my_url = "https://www.census.gov/programs-surveys/popest.html"
# call on packages
html_page = myRequest(my_url)
raw_html = html_page.read()
html_page.close()
page_soup = mySoup(raw_html, "html.parser")
f = open("censusTest.csv", "w")
hyperlinks = page_soup.findAll('a')
set_urls = set()
for checked in hyperlinks:
    found_link = checked.get("href")
    result_set = myJoin(my_url, found_link)
    if result_set and result_set not in set_urls:
        set_urls.add(result_set)
        f.write(str(result_set) + "\n")
f.close()
You can always right-strip the slash: it is removed if present, and nothing happens if it is not:
result_set = myJoin(my_url, found_link).rstrip("/")
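For example, using the two addresses from the question to show the behaviour:
print("ask.census.gov/".rstrip("/"))  # ask.census.gov
print("ask.census.gov".rstrip("/"))   # ask.census.gov
Both forms end up identical after stripping, so the set only ever stores one of them.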
my_url = "https://www.census.gov/programs-surveys/popest.html/"
if my_url[-1:] == '/':
    my_url = my_url[:-1]
This snippet checks whether the last character in your string is a '/', and if it is, deletes it.
Good examples of python string manipulation:
http://www.pythonforbeginners.com/basics/string-manipulation-in-python

Having problems following links with webcrawler

I am trying to create a webcrawler that parses all the html on the page, grabs a specified (via raw_input) link, follows that link, and then repeats this process a specified number of times (once again via raw_input). I am able to grab the first link and successfully print it. However, I am having problems "looping" the whole process, and usually grab the wrong link. This is the first link
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html
(Full disclosure, this question pertains to an assignment for a Coursera course)
Here's my code
import urllib
from BeautifulSoup import *
url = raw_input('Enter - ')
rpt=raw_input('Enter Position')
rpt=int(rpt)
cnt=raw_input('Enter Count')
cnt=int(cnt)
count=0
counts=0
tags=list()
soup=None
while x==0:
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    # Retrieve all of the anchor tags
    tags=soup.findAll('a')
    for tag in tags:
        url= tag.get('href')
        count=count + 1
        if count== rpt:
            break
    counts=counts + 1
    if counts==cnt:
        x==1
    else: continue
print url
Based on DJanssens' response, I found the solution:
url = tags[position-1].get('href')
did the trick for me!
Thanks for the assistance!
I also worked on that course, and with help from a friend, I got this worked out:
import urllib
from bs4 import BeautifulSoup
url = "http://python-data.dr-chuck.net/known_by_Happy.html"
rpt=7
position=18
count=0
counts=0
tags=list()
soup=None
x=0
while x==0:
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html,"html.parser")
    tags=soup.findAll('a')
    url= tags[position-1].get('href')
    count=count + 1
    if count == rpt:
        break
print url
I believe this is what you are looking for:
import urllib
from bs4 import *
url = raw_input('Enter - ')
position=int(raw_input('Enter Position'))
count=int(raw_input('Enter Count'))
#perform the loop "count" times.
for _ in xrange(0,count):
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    tags = soup.findAll('a')
    for tag in tags:
        url = tag.get('href')
    tags = soup.findAll('a')
    # if the link does not exist at that position, show an error.
    if not tags[position-1]:
        print "A link does not exist at that position."
    # if the link at that position exists, overwrite url so the next search will use it.
    url = tags[position-1].get('href')
    print url
The code will now loop the number of times specified in the input; each time it takes the href at the given position and overwrites url with it, so each iteration looks one step further along the chain of links.
I advise you to use full names for variables, which is a lot easier to understand. In addition, you can cast and read them in a single line, which makes the beginning of your script easier to follow.
Here are my two cents:
import urllib
#import ssl
from bs4 import BeautifulSoup
#'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
url = raw_input('Enter URL : ')
position = int(raw_input('Enter position : '))
count = int(raw_input('Enter count : '))
print('Retrieving: ' + url)
soup = BeautifulSoup(urllib.urlopen(url).read())
for x in range(1, count + 1):
    link = list()
    for tag in soup('a'):
        link.append(tag.get('href', None))
    print('Retrieving: ' + link[position - 1])
    soup = BeautifulSoup(urllib.urlopen(link[position - 1]).read())

Access element using xpath?

I would like to get the links to all of the elements in the first column in this page (http://en.wikipedia.org/wiki/List_of_school_districts_in_Alabama).
I am comfortable using BeautifulSoup, but it seems less well-suited to this task (I've been trying to access the first child of the contents of each tr but that hasn't been working so well).
The xpaths follow a regular pattern, the row number updating for each new row in the following expression:
xpath = '//*[@id="mw-content-text"]/table[1]/tbody/tr[' + str(counter) + ']/td[1]/a'
Would someone help me by posting a means of iterating through the rows to get the links?
I was thinking something along these lines:
urls = []
while counter < 100:
    urls.append(get the xpath('//*[@id="mw-content-text"]/table[1]/tbody/tr[' + str(counter) + ']/td[1]/a'))
    counter += 1
Thanks!
Here's an example of how you can get all of the links from the first column:
from lxml import etree
import requests
URL = "http://en.wikipedia.org/wiki/List_of_school_districts_in_Alabama"
response = requests.get(URL)
parser = etree.HTMLParser()
tree = etree.fromstring(response.text, parser)
for row in tree.xpath('//*[@id="mw-content-text"]/table[1]/tr'):
    links = row.xpath('./td[1]/a')
    if links:
        link = links[0]
        print link.text, link.attrib.get('href')
Note that tbody is added by the browser - lxml won't see this tag (just skip it in the XPath).
Hope that helps.
This should work:
from lxml import html
urls = []
parser = html.parse("http://url/to/parse")
for element in parser.xpath(your_xpath_query):
    urls.append(element.attrib['href'])
You could also access the href attribute in the XPath query directly, e.g.:
for href in parser.xpath("//a/@href"):
    urls.append(href)
The page you linked to does not seem to have content at the XPath you specified. Here is a different XPath which does the job:
import urllib2
import lxml.html as LH
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', "Mozilla/5.0")]
url = 'http://en.wikipedia.org/wiki/List_of_school_districts_in_Alabama'
xpath = '//table[@class="wikitable sortable"]//tr/td[1]/a/@href'
doc = LH.parse(opener.open(url))
urls = doc.xpath(xpath)
print(urls)
Maybe you are looking for something like
urls = []
while True:
    try:
        counter = len(urls)+1
        (node,) = tree.xpath('//*[@id="mw-content-text"]/table[1]/tbody/tr[' + str(counter) + ']/td[1]/a')
        urls.append(node)
    except ValueError:
        break
