I am looking for code that will collect all internal links from a website by iterating over every internal link (both absolute and relative) it finds.
So far I have managed to write this much, but I am unable to construct the right logic in the program.
import requests, csv, time
from lxml import html
from collections import OrderedDict
links = []
domain = 'bunchball.com'
base_link = 'http://www.bunchball.com/'
unique_list = []
def get_links(base_link):
    r = requests.get(base_link)
    source = html.fromstring(r.content)
    link = source.xpath('//a/@href')
    for each in link:
        each = str(each)
        if domain in each:
            links.append(each)
        elif each.startswith('/'):
            links.append(base_link+each)
            unique_list.append(each)
        else:
            pass
get_links(base_link)
#while
for each1 in list(OrderedDict.fromkeys(links)):
    get_links(each1)
    while each1 not in unique_list:
        unique_list.append(each1)
        get_links(each1)
Try mechanize for a simpler solution:
from mechanize import Browser
br = Browser()
br.open("http://www.bunchball.com/")
list_of_links=[link for link in br.links()]
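If you want to stay with requests + lxml from the original attempt, the missing piece is a loop over a queue of not-yet-visited URLs instead of re-calling get_links by hand. A rough sketch of that crawl loop (a starting point only: minimal error handling, no politeness delay, and an arbitrary page cap so it terminates):

import requests
from lxml import html
from urllib.parse import urljoin, urlparse  # on Python 2: from urlparse import urljoin, urlparse

domain = 'bunchball.com'
start = 'http://www.bunchball.com/'

seen = set()
queue = [start]
while queue:
    url = queue.pop(0)
    if url in seen:
        continue
    seen.add(url)
    try:
        page = html.fromstring(requests.get(url, timeout=10).content)
    except requests.RequestException:
        continue
    for href in page.xpath('//a/@href'):
        full = urljoin(url, href).split('#')[0]  # resolves relative links, drops fragments
        if urlparse(full).netloc.endswith(domain) and full not in seen:
            queue.append(full)
    if len(seen) >= 200:  # arbitrary cap so the sketch terminates
        break

print(len(seen), 'internal pages visited')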
I've created a script in Python to fetch different product links from a webpage. Although I know the content of that site is dynamic, I tried the conventional way just to show that I have tried. I looked for APIs in the dev tools but could not find any. Isn't there any way to get those links using requests?
Site link: https://www.amazon.com/stores/node/10699640011
Here is what I've written so far:
import requests
from bs4 import BeautifulSoup
link = "https://www.amazon.com/stores/node/10699640011"
def fetch_product_links(url):
    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.text, "lxml")
    for item_link in soup.select("[id^='ProductGrid-'] li[class^='style__itemOuter__'] > a"):
        print(item_link.get("href"))

if __name__ == '__main__':
    fetch_product_links(link)
How can I fetch different product links from that site using requests?
I think you only need the ASINs, which you can collect from another URL construct you can see in the network tab, i.e. you can significantly shorten the final URLs. You do, however, need to make a request to your original URL to pick up an identifier to use in the second URL. This returns 146 links.
import requests, re, json
node = '10699640011'
with requests.Session() as s:
    r = s.get(f'https://www.amazon.com/stores/node/{node}')
    p = re.compile(r'var slotsStr = "\[(.*?,){3} share\]";')
    identifier = p.findall(r.text)[0]
    identifier = identifier.strip()[:-1]
    r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
    p = re.compile(r'var config = (.*?);')
    data = json.loads(p.findall(r.text)[0])
    asins = data['content']['ASINList']
    links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
    print(links)
EDIT:
With two given nodes:
import requests, re, json
from bs4 import BeautifulSoup as bs
nodes = ['3039806011','10699640011']
with requests.Session() as s:
    for node in nodes:
        r = s.get(f'https://www.amazon.com/stores/node/{node}')
        soup = bs(r.content, 'lxml')
        identifier = soup.select('.stores-widget-btf:not([id=share],[id*=RECOMMENDATION])')[-1]['id']
        r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
        p = re.compile(r'var config = (.*?);')
        data = json.loads(p.findall(r.text)[0])
        asins = data['content']['ASINList']
        links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
        print(links)
I am trying to check whether each link contains http and, if so, print the URL.
import requests
from requests_html import HTMLSession
import sys
link = "http://www.tvil.me/view/93/4/8/v/%D7%90%D7%99%D7%99_%D7%96%D7%95%D7%9E%D7%91%D7%99_IZombie.html"
enter_episodes = HTMLSession().get(link)
page = enter_episodes.html
s = page.xpath("//*[@class='view-watch-button']/a")
for l in s:
    link = l.links
    if link != "set()":
        print(link)
Response:
{'http://streamcloud.eu/ga4m4hizbrfb/iZombie.S04E08.HDTV.x264-SVA.mkv.html'}
{'http://uptostream.com/77p26f7twwhe'}
set()
{'https://clipwatching.com/aog2ni06rzjt/rrFhepnbFfpt6xg.mkv.html'}
set()
[Finished in 1.7s]
I tried to drop the empty set() responses and to get only the link, without the {' and '}.
You just need to make sure the set is not empty, and then pop it:
import requests
from requests_html import HTMLSession
import sys
link = "http://www.tvil.me/view/93/4/8/v/%D7%90%D7%99%D7%99_%D7%96%D7%95%D7%9E%D7%91%D7%99_IZombie.html"
enter_episodes = HTMLSession().get(link)
page = enter_episodes.html
s = page.xpath("//*[@class='view-watch-button']/a")
for l in s:
    link = l.links
    if len(link) > 0:  # make sure it has a value
        print(link.pop())  # pop the only element out of the set
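Note that set.pop() removes an arbitrary element, which is fine here because each set holds at most one link. If you would rather read the value without emptying the set, something like this works too (same loop as above, only the last two lines differ):

for l in s:
    found = l.links
    if found:
        print(next(iter(found)))  # read the single element without removing it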
From an online python course:
You will be given a website with 100 names. All names are in the form of a link. Each link leads to another page with 100 links. You must use Python to follow the 18th link 7 times, and print out the results.
my code so far:
z = 0
atags = []
listurl = []
#import modules
import urllib
from bs4 import BeautifulSoup
import re
newurl = "https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Desmond.html"
while z < 7:
    url = newurl
    z = z + 1
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    soup.find_all("url")
    a = soup.find_all('a')
    for x in a:
        atags.append(str(x))
    url_end_full = atags[19]
    url_end = re.findall(r'"(.*?)"', url_end_full)
    url_end = str(url_end[0])
    newurl = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/' + url_end
    str(newurl)
    listurl.append(newurl)
    url = newurl
    print url
It does not work: it keeps giving me the same link over and over.
This is the output:
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Lauchlin.html
[Finished in 2.4s]
The answer was wrong when I entered it into the answer box.
There are a couple of problems.
atags[19] is not the 18th item, it is the 20th (lst[0] is the first item in a list).
soup.find_all("url") does nothing; get rid of it.
You do not need re.
The links returned are relative; you are doing a hard-join to the base path to make them absolute. In this case it works, but that is a matter of luck; do it right with urljoin (see the short illustration after this list).
While str(link) does get you the url, the "proper" method is by indexing into the attributes, i.e. link['href'].
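For example, urljoin resolves a relative href against the page it came from and leaves absolute URLs untouched (the file names below are made up purely for illustration):

from urllib.parse import urljoin  # on Python 2: from urlparse import urljoin

base = "https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Desmond.html"
print(urljoin(base, "known_by_Someone.html"))
# https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Someone.html
print(urljoin(base, "https://example.com/elsewhere.html"))
# https://example.com/elsewhere.html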
With some judicious cleanup,
from bs4 import BeautifulSoup
import sys
# version compatibility shim
if sys.hexversion < 0x3000000:
    # Python 2.x
    from urlparse import urljoin
    from urllib import urlopen
else:
    # Python 3.x
    from urllib.parse import urljoin
    from urllib.request import urlopen

START_URL = "https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Desmond.html"
STEPS = 7
ITEM = 18

def get_soup(url):
    with urlopen(url) as page:
        return BeautifulSoup(page.read(), 'lxml')

def main():
    url = START_URL
    for step in range(STEPS):
        print("\nStep {}: looking at '{}'".format(step, url))
        # get the right item (Python arrays start indexing at 0)
        links = get_soup(url).find_all("a")
        rel_url = links[ITEM - 1]["href"]
        # convert from relative to absolute url
        url = urljoin(url, rel_url)
        print(" go to '{}'".format(url))

if __name__=="__main__":
    main()
which, if I did it right, ends with known_by_Gideon.html
Hi guys, I need to define a function that gets the list of all paginated URLs at the bottom of a page, for links read from a txt file, in Python.
Here is an example of what I need done.
Input link
http://www.apartmentguide.com/apartments/Alabama/Hartselle/
Desired Output
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=6
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=7
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=8
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=9
and so on, up to whatever page limit each input URL has.
This is the function I have written so far, but it is not working. I am not good with Python either.
import requests
#from bs4 import BeautifulSoup
from scrapy import Selector as Se
import urllib2
lists = open("C:\Users\Administrator\Desktop\\3.txt","r")
read_list = lists.read()
line = read_list.split("\n")
def get_links(line):
    for each in line:
        r = requests.get(each)
        sel = Se(text=r.text, type="html")
        next_ = sel.xpath('//a[@class="next sprite"]//@href').extract()
        for next_1 in next_:
            next_2 = "http://www.apartmentguide.com"+next_1
            print next_2
            get_links(next_1)

get_links(line)
Below are two ways to do this.
import mechanize
import requests
from bs4 import BeautifulSoup, SoupStrainer
import urlparse
import pprint
#-- Mechanize --
br = mechanize.Browser()
def get_links_mechanize(root):
    links = []
    br.open(root)
    for link in br.links():
        try:
            if dict(link.attrs)['class'] == 'page':
                links.append(link.absolute_url)
        except:
            pass
    return links

#-- Requests / BeautifulSoup / urlparse --
def get_links_bs(root):
    links = []
    r = requests.get(root)
    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('class') and 'page' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))
    return links
#with open("C:\Users\Administrator\Desktop\\3.txt","r") as f:
# for root in f:
# links = get_links(root)
# # <Do something with links>
root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
print "Mech:"
pprint.pprint( get_links_mechanize(root) )
print "Requests/BS4/urlparse:"
pprint.pprint( get_links_bs(root) )
One uses mechanize -- it's a bit smarter with URLs but it's a lot slower and may be overkill depending on what else you're doing.
The other uses requests to fetch the page (urllib2 would suffice), BeautifulSoup to parse the markup and urlparse to form absolute URLs from the relative URLs in the page you listed.
Note that both of these functions return the following list:
['http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5']
which has duplicates. You can get rid of the duplicates by changing
return links
to
return list(set(links))
for whatever method you choose.
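If you care about keeping the page order, note that set() does not preserve it. An order-preserving dedup (a sketch using collections.OrderedDict from the standard library) would be to change the return line to

return list(OrderedDict.fromkeys(links))  # removes duplicates, keeps first-seen order

with from collections import OrderedDict added at the top of the script.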
EDIT:
I noticed that the above functions only returned the links to pages 2-5, and you'd have to navigate those pages to see that there were in fact 10 pages.
A completely different approach would be to scrape the "root" page for number of results, then predict how many pages that would result in, then build links from that.
Since there are 20 results per page, figuring out how many pages is straightforward, consider:
import requests, re, math, pprint
def scrape_results(root):
    links = []
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1)) # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0)) # ceil(182/20) => 10
    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))
    return links
pprint.pprint(scrape_results(root))
This will be the fastest method of the 3, but possibly more error prone.
EDIT 2:
Maybe something like:
import re, math, pprint
import requests, urlparse
from bs4 import BeautifulSoup, SoupStrainer
def get_pages(root):
    links = []
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1)) # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0)) # ceil(182/20) => 10
    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))
    return links

def get_listings(page):
    links = []
    r = requests.get(page)
    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('data-listingid') and 'name' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))
    return links

root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
listings = []
for page in get_pages(root):
    listings += get_listings(page)
pprint.pprint(listings)
print(len(listings))
I was unsure with re, so I tried XPath instead.
links = open("C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\2.txt","r")
read_list = links.read()
line = read_list.split("\n")
for each in line:
    lines = []
    r = requests.get(each)
    sel = Selector(text=r.text,type="html")
    mat = sel.xpath('//h1//strong/text()').extract()
    mat = str(mat)
    mat1 = mat.replace(" apartments for rent']","")
    mat2 = mat1.replace("[u'","")
    mat3 = int(mat2)
    num_pages = int(math.ceil(mat3/20.0))
    for i in range(num_pages):
        lines.append("%s/Page%d" % (each, (i+1)))
    with open('C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\test.csv', 'ab') as f:
        writer = csv.writer(f)
        for val in lines:
            writer.writerow([val])
I would like to get the links to all of the elements in the first column on this page (http://en.wikipedia.org/wiki/List_of_school_districts_in_Alabama).
I am comfortable using BeautifulSoup, but it seems less well-suited to this task (I've been trying to access the first child of the contents of each tr but that hasn't been working so well).
The xpaths follow a regular pattern, the row number updating for each new row in the following expression:
xpath = '//*[@id="mw-content-text"]/table[1]/tbody/tr[' + str(counter) + ']/td[1]/a'
Would someone help me by posting a means of iterating through the rows to get the links?
I was thinking something along these lines:
urls = []
while counter < 100:
    urls.append(get the xpath('//*[@id="mw-content-text"]/table[1]/tbody/tr[' + str(counter) + ']/td[1]/a'))
    counter += 1
Thanks!
Here's an example of how you can get all of the links from the first column:
from lxml import etree
import requests
URL = "http://en.wikipedia.org/wiki/List_of_school_districts_in_Alabama"
response = requests.get(URL)
parser = etree.HTMLParser()
tree = etree.fromstring(response.text, parser)
for row in tree.xpath('//*[@id="mw-content-text"]/table[1]/tr'):
    links = row.xpath('./td[1]/a')
    if links:
        link = links[0]
        print link.text, link.attrib.get('href')
Note that tbody is appended by the browser; lxml won't see this tag (just skip it in the xpath).
Hope that helps.
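Since the question mentions being comfortable with BeautifulSoup, here is a rough equivalent sketch using it (assuming requests and bs4 are installed, and that the districts table is the first wikitable on the page):

import requests
from bs4 import BeautifulSoup

URL = "http://en.wikipedia.org/wiki/List_of_school_districts_in_Alabama"
soup = BeautifulSoup(requests.get(URL).text, "lxml")

table = soup.find("table", class_="wikitable")  # first wikitable on the page
for row in table.find_all("tr"):
    cell = row.find("td")  # first column; header rows have no td
    if cell is not None and cell.a is not None:
        print(cell.a.get_text(), cell.a.get("href"))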
This should work:
from lxml import html
urls = []
parser = html.parse("http://url/to/parse")
for element in parser.xpath(your_xpath_query):
    urls.append(element.attrib['href'])
You could also access the href attribute in the XPath query directly, e.g.:
for href in parser.xpath("//a/@href"):
    urls.append(href)
The page you linked to does not seem to have content at the XPath you specified. Here is a different XPath which does the job:
import urllib2
import lxml.html as LH
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', "Mozilla/5.0")]
url = 'http://en.wikipedia.org/wiki/List_of_school_districts_in_Alabama'
xpath = '//table[@class="wikitable sortable"]//tr/td[1]/a/@href'
doc = LH.parse(opener.open(url))
urls = doc.xpath(xpath)
print(urls)
Maybe you are looking for something like this:
# tree here is the parsed lxml tree built as in the first answer above
urls = []
while True:
    try:
        counter = len(urls)+1
        # as noted above, lxml's tree has no tbody, so you may need to drop it from this xpath
        (node,) = tree.xpath('//*[@id="mw-content-text"]/table[1]/tbody/tr[' + str(counter) + ']/td[1]/a')
        urls.append(node)
    except ValueError:
        break