Extract URLs in a Sitemap With Python

I need to extract the links in this sitemap:
https://wunder.com.tr/sitemap.xml
I wrote some code:
import requests
from bs4 import BeautifulSoup
wunder = requests.get("https://wunder.com.tr/sitemap.xml")
parcala = BeautifulSoup(wunder.content,"lxml")
links = parcala.find_all("html-tag")  # "html-tag" is a placeholder, not a real tag, so this finds nothing
print(links)
But I'm unable to extract the links.

The sitemap is XML rather than HTML, so parse it with the xml parser and collect the <loc> tags:

import requests
from bs4 import BeautifulSoup

wunder = requests.get("https://wunder.com.tr/sitemap.xml")
parcala = BeautifulSoup(wunder.content, "xml")

urls_from_xml = []
loc_tags = parcala.find_all("loc")
for loc in loc_tags:
    urls_from_xml.append(loc.get_text())

print(urls_from_xml)
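If you'd rather not depend on bs4 at all, the standard library's ElementTree can parse the sitemap too. A minimal sketch, assuming the sitemap declares the standard sitemap namespace:

import requests
import xml.etree.ElementTree as ET

wunder = requests.get("https://wunder.com.tr/sitemap.xml")
root = ET.fromstring(wunder.content)

# The usual namespace for sitemap.xml files; adjust if the file differs
ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
urls = [loc.text for loc in root.findall(".//sm:loc", ns)]
print(urls)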

Related

How to extract a full link from an href tag with BeautifulSoup?

This isn't just a simple how-to-retrieve-links question. When I scrape a page, the href returns something like '/people/4849247002', but if you inspect the page itself, that href actually takes you to 'https://website/people/4849247002' when clicked. How can I get the link as 'https://website/people/4849247002' instead?
Also, a side note: what's the correct way to use BeautifulSoup to get a webpage? I've been using both of the following:
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen("http://www.yourwebsite.com")
soup = BeautifulSoup(html_page)
and
import requests
from bs4 import BeautifulSoup
import re
import time
source_code = requests.get('https://stackoverflow.com/')
soup = BeautifulSoup(source_code.content, 'lxml')
I'm currently using Python 3.8.
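For the relative-href part of the question: urllib.parse.urljoin resolves a path like '/people/4849247002' against the page's base URL and leaves already-absolute hrefs unchanged. A minimal sketch ('https://website' is the placeholder domain from the question):

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

base_url = "https://website"  # placeholder domain from the question
soup = BeautifulSoup(requests.get(base_url).content, "lxml")

# urljoin turns '/people/4849247002' into 'https://website/people/4849247002'
links = [urljoin(base_url, a["href"]) for a in soup.find_all("a", href=True)]
print(links)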
Another method:
from simplified_scrapy import SimplifiedDoc, utils, req

url = 'https://boards.greenhouse.io/adhocexternal'
html = req.get(url)
doc = SimplifiedDoc(html)

print(doc.listA(url).url)  # print all links
# Or:
lstA = doc.selects('a@data-mapped=true>href()')
print([utils.absoluteUrl(url, a) for a in lstA])
Result:
['https://adhoc.team/join/', 'https://adhoc.team/blog/', 'https://boards.greenhouse.io/adhocexternal/jobs/4877141002', 'https://boards.greenhouse.io/adhocexternal/jobs/4877155002', 'https://boards.greenhouse.io/adhocexternal/jobs/4869701002', 'https://boards.greenhouse.io/adhocexternal/jobs/4877146002', ...
['https://boards.greenhouse.io/adhocexternal/jobs/4877141002', 'https://boards.greenhouse.io/adhocexternal/jobs/4877155002', ...
Or you can use the framework directly.
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils

class MySpider(Spider):
    name = 'greenhouse'
    start_urls = ['https://boards.greenhouse.io/adhocexternal']

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        urls = doc.listA(url.url)
        data = doc.title  # whatever data you want to get
        return {'Urls': urls, 'Data': data}

SimplifiedMain.startThread(MySpider())  # start the download

Dynamically extract text from a webpage using Python BeautifulSoup

I'm trying to extract player position from many players' webpages (here's an example for Malcolm Brogdon). I'm able to extract Malcolm Brogdon's position using the following code:
player_id = 'malcolm-brogdon-1'
# Import libraries
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
url = "https://www.sports-reference.com/cbb/players/{}.html".format(player_id)
req = Request(url , headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
pos = page_soup.p.find("strong").next_sibling.strip()
pos
However, I want to be able to do this in a more dynamic way (that is, to locate "Position:" and then grab whatever comes after it). The webpage is structured slightly differently for some other players, and my current code doesn't return their position (e.g., Cat Barber).
I've tried doing something like page_soup.find("strong", text="Position:") but that doesn't seem to work.
You can select the element that contains the text "Position:" and then the next text sibling:
import requests
from bs4 import BeautifulSoup
url = "https://www.sports-reference.com/cbb/players/anthony-cat-barber-1.html"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
pos = soup.select_one('strong:contains("Position")').find_next_sibling(text=True).strip()
print(pos)
Prints:
Guard
EDIT: Another version:
import requests
from bs4 import BeautifulSoup
url = "https://www.sports-reference.com/cbb/players/anthony-cat-barber-1.html"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
pos = (
    soup.find("strong", text=lambda t: t and "Position" in t)
    .find_next_sibling(text=True)
    .strip()
)
print(pos)
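Note that newer BeautifulSoup releases prefer string= over text= (text= now emits a deprecation warning), and a compiled regex is a safe matcher because it simply fails to match tags with no string. A variant sketch under those assumptions:

import re
import requests
from bs4 import BeautifulSoup

url = "https://www.sports-reference.com/cbb/players/anthony-cat-barber-1.html"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# string= matches against the tag's own text; the regex matches "Position"
# anywhere in the label, tolerating punctuation like the trailing colon
label = soup.find("strong", string=re.compile("Position"))
pos = label.find_next_sibling(string=True).strip()
print(pos)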

How to scrape plaintext from multiple links off of one website?

from bs4 import BeautifulSoup
import bs4 as bs
import pandas as pd
import numpy as np
import json
import csv
import re
import urllib.request

sauce = urllib.request.urlopen("https://www.imdb.com/list/ls003073623/").read()
soup = bs.BeautifulSoup(sauce, 'html.parser')
soup.findAll('a', href=re.compile('^/title/'))
I am trying to scrape multiple links (about 500) off of one website, and I don't want to input each and every URL manually. How do I go about scraping this?
With BeautifulSoup
If I understand correctly, you are trying to obtain a list containing a part of each link on a given website. There is an example in BeautifulSoup's documentation that shows exactly how to do that:
from bs4 import BeautifulSoup
import urllib.request
import re

html_page = urllib.request.urlopen("https://www.imdb.com/list/ls003073623/")
soup = BeautifulSoup(html_page, 'html.parser')

ids = []
# Match absolute links (http or https) and keep the id segment of each URL
for link in soup.findAll('a', attrs={'href': re.compile("^https?://")}):
    ids.append(link.get('href').split("/")[4])
print(ids)
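Since the asker's own snippet already targets the /title/ hrefs with Python 3 imports, here is that approach completed under the same assumptions (relative hrefs shaped like /title/tt0111161/):

import re
import urllib.request
from bs4 import BeautifulSoup

sauce = urllib.request.urlopen("https://www.imdb.com/list/ls003073623/").read()
soup = BeautifulSoup(sauce, "html.parser")

# With relative hrefs like /title/tt0111161/?ref_=..., index 2 of the
# split is the title id; a set removes duplicates from repeated links
ids = sorted({a["href"].split("/")[2]
              for a in soup.find_all("a", href=re.compile("^/title/"))})
print(ids)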
With Selenium
For reference, and since it doesn't seem like the question is limited to only BeautifulSoup, here's how we would do the same using Selenium, a very popular alternative.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.imdb.com/list/ls003073623/")

ids = []
elems = driver.find_elements_by_xpath("//a[@href]")
for elem in elems:
    ids.append(elem.get_attribute("href").split("/")[4])
print(ids)
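If you're on Selenium 4.3 or newer, the find_elements_by_* helpers have been removed in favor of find_elements with a By locator; an equivalent sketch:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.imdb.com/list/ls003073623/")

# Same XPath as above, expressed with the Selenium 4 locator API
ids = [elem.get_attribute("href").split("/")[4]
       for elem in driver.find_elements(By.XPATH, "//a[@href]")]
print(ids)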

Extract the data from a web page with mostly textual content using BeautifulSoup in Python

I have been trying to extract the data-rich nodes of a web page. Is there a way to extract the text from the webpage?
import requests
from bs4 import BeautifulSoup

url = "http://www.amazon.in"
r = requests.get(url)
html = BeautifulSoup(r.content, "html.parser")
print(html.title.text)

I can print the title of the webpage; can you please help me extract the text (only the text) from the webpage?
Thanks in advance
Try doing this:
import re
import requests
from bs4 import BeautifulSoup

html = requests.get('http://www.amazon.in').content
soup = BeautifulSoup(html, 'html.parser')
texts = soup.findAll(text=True)

def visible(element):
    # Filter out text that a browser never renders
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element)):
        return False
    return True

visible_texts = list(filter(visible, texts))
print(visible_texts)
Try this:
import requests
from bs4 import BeautifulSoup

url = "http://www.amazon.in"
r = requests.get(url)
html = BeautifulSoup(r.content, "html.parser")
print(html.get_text())
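A variant that combines the two answers: remove the non-visible tags first, then let stripped_strings normalize the whitespace. A sketch, assuming the page is reachable without custom headers:

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("http://www.amazon.in").content, "html.parser")

# Drop elements whose text a browser never renders
for tag in soup(["script", "style", "noscript"]):
    tag.decompose()

# stripped_strings yields each remaining text fragment with whitespace trimmed
body = soup.body or soup
print(" ".join(body.stripped_strings))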

Getting email IDs using BeautifulSoup

I am working in Python and learning BeautifulSoup, and I am parsing a link.
My URL:
http://www.dtemaharashtra.gov.in/approvedinstitues/StaticPages/frmInstituteSummary.aspx?InstituteCode=1002
I want to parse the email ID from that URL.
How can I do that?
import urllib.request
from bs4 import BeautifulSoup

html = urllib.request.urlopen('http://www.dtemaharashtra.gov.in/approvedinstitues/StaticPages/frmInstituteSummary.aspx?InstituteCode=1002').read()
soup = BeautifulSoup(html, 'html.parser')
print(soup.find(id='ctl00_rightContainer_ContentBox1_lblEMailAddress').text)
Or with requests:

import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.dtemaharashtra.gov.in/approvedinstitues/StaticPages/frmInstituteSummary.aspx?InstituteCode=1002")
soup = BeautifulSoup(r.text, "html.parser")
print(soup.find("span", {"id": "ctl00_rightContainer_ContentBox1_lblEMailAddress"}).text)
