Collecting information by scraping - python

I am trying to collect names of politicians by scraping Wikipedia.
What I need is to scrape all parties from this page: https://it.wikipedia.org/wiki/Categoria:Politici_italiani_per_partito, and then, for each party listed there, scrape the names of all the politicians within that party.
I wrote the following code:
from bs4 import BeautifulSoup as bs
import requests

res = requests.get("https://it.wikipedia.org/wiki/Categoria:Politici_italiani_per_partito")
soup = bs(res.text, "html.parser")

array1 = {}
possible_links = soup.find_all('a')
for link in possible_links:
    url = link.get("href", "")
    if "/wiki/Provenienza" in url:  # It is incomplete, as I should scrape also links including the word "Politici di/dei"
        res1 = requests.get("https://it.wikipedia.org" + url)
        print("https://it.wikipedia.org" + url)
        soup = bs(res1, "html.parser")
        possible_links1 = soup.find_all('a')
        for link in possible_links1:
            url_1 = link.get("href", "")
            array1[link.text.strip()] = url_1
but it does not work, as it does not collect the names for each party. It collects all the parties (from the Wikipedia page that I mentioned above); however, when I try to scrape the parties' pages, it does not collect the names of the politicians within each party.
I hope you can help me.

You could collect the URLs and party names from the first page, then loop over those URLs and add the list of associated politician names to a dict keyed by the party name. You would gain efficiency by using a Session object and thereby re-using the underlying TCP connection.
from bs4 import BeautifulSoup as bs
import requests

results = {}

with requests.Session() as s:  # use a session object for efficiency of tcp re-use
    s.headers = {'User-Agent': 'Mozilla/5.0'}
    r = s.get('https://it.wikipedia.org/wiki/Categoria:Politici_italiani_per_partito')
    soup = bs(r.content, 'lxml')
    # dict of party names and party links (the hrefs already start with "/")
    party_info = {i.text: 'https://it.wikipedia.org' + i['href'] for i in soup.select('.CategoryTreeItem a')}

    for party, link in party_info.items():
        r = s.get(link)
        soup = bs(r.content, 'lxml')
        results[party] = [i.text for i in soup.select('.mw-content-ltr .mw-content-ltr a')]  # get politicians' names
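A quick sanity check (illustrative, not part of the original answer) is to print a few entries of the resulting dict once the loop has finished:

# Illustrative: show the first few parties and a handful of names each.
for party, names in list(results.items())[:3]:
    print(party, names[:5])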

EDIT : Please refer to QHarr's answer above.
So far I have scraped all the parties, and nothing more. I'm sharing this code and I'll edit my answer once I have also collected all the politicians.
from bs4 import BeautifulSoup as bs
import requests

res = requests.get("https://it.wikipedia.org/wiki/Categoria:Politici_italiani_per_partito")
soup = bs(res.text, "html.parser")

url_list = []
politicians_dict = {}

possible_links = soup.find_all('a')
for link in possible_links:
    url = link.get("href", "")
    if ("/wiki/Provenienza" in url) or ("/wiki/Categoria:Politici_d" in url):
        full_url = "https://it.wikipedia.org" + url
        url_list.append(full_url)

for url in url_list:
    print(url)
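A possible way to complete the second step (a sketch, not tested against every party page): visit each collected category URL and read the member links, which on Wikipedia category pages usually sit inside the #mw-pages container.

# Sketch: collect politician names per party category page.
# Assumes the member links live under "#mw-pages li a", the usual layout
# of Wikipedia category pages; adjust the selector if the markup differs.
for url in url_list:
    res = requests.get(url)
    party_soup = bs(res.text, "html.parser")
    party_name = party_soup.find("h1").text
    politicians_dict[party_name] = [a.text for a in party_soup.select("#mw-pages li a")]

print(politicians_dict)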

Related

How to select all links of apps from the App Store and extract their href?

from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
url = f'https://www.apple.com/kr/search/youtube?src=globalnav'
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html.parser')
links = soup.select(".rf-serp-productname-list")
print(links)
I want to crawl through all the links of the shown apps. When I searched for a keyword, I thought links = soup.select(".rf-serp-productname-list") would work, but the links list is empty.
What should I do?
Just check this code; I think it is what you want:
import re
import requests
from bs4 import BeautifulSoup

pages = set()

def get_links(page_url):
    global pages
    pattern = re.compile("^(/)")
    html = requests.get(f"your_URL{page_url}").text  # f-strings require Python 3.6+
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.find_all("a", href=pattern):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                new_page = link.attrs["href"]
                print(new_page)
                pages.add(new_page)
                get_links(new_page)

get_links("")
Source:
https://gist.github.com/AO8/f721b6736c8a4805e99e377e72d3edbf
You can change this part:
for link in soup.find_all("a", href=pattern):
    # do something
to check for a keyword, I think.
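For instance (an illustrative sketch, not part of the original answer), you could keep only the links whose href or text contains the keyword:

# Illustrative: filter the crawled links by a keyword.
keyword = "youtube"  # hypothetical keyword for the example
for link in soup.find_all("a", href=pattern):
    href = link.attrs.get("href", "")
    if keyword in href.lower() or keyword in link.get_text().lower():
        print(href)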
You are cooking a soup, so first of all taste it and check whether everything you expect is actually in it.
The ResultSet of your selection is empty because the structure in the response differs a bit from the one you expected based on the developer tools.
To get the list of links, select more specifically:
links = [a.get('href') for a in soup.select('a.icon')]
Output:
['https://apps.apple.com/kr/app/youtube/id544007664', 'https://apps.apple.com/kr/app/%EC%BF%A0%ED%8C%A1%ED%94%8C%EB%A0%88%EC%9D%B4/id1536885649', 'https://apps.apple.com/kr/app/youtube-music/id1017492454', 'https://apps.apple.com/kr/app/instagram/id389801252', 'https://apps.apple.com/kr/app/youtube-kids/id936971630', 'https://apps.apple.com/kr/app/youtube-studio/id888530356', 'https://apps.apple.com/kr/app/google-chrome/id535886823', 'https://apps.apple.com/kr/app/tiktok-%ED%8B%B1%ED%86%A1/id1235601864', 'https://apps.apple.com/kr/app/google/id284815942']
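If you also want the anchor text next to each URL (an illustrative follow-up, using the same a.icon selector as above):

# Illustrative: print each anchor's visible text together with its href.
# The visible text may be empty if the app name sits in a sibling element,
# so inspect the actual markup before relying on it.
for a in soup.select('a.icon'):
    print(a.get_text(strip=True), a.get('href'))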

How can I debug this CCASS Web Scraping (aspx form) code please?

The code below is supposed to scrape data from an .aspx website. It does not, however, return anything (and no error is raised either).
original stackoverflow post:
Scraping .aspx page with python (HKEX)
import requests
from bs4 import BeautifulSoup

URL = "http://www.hkexnews.hk/sdw/search/searchsdw.aspx"

with requests.Session() as s:
    s.headers = {"User-Agent": "Mozilla/5.0"}
    res = s.get(URL)
    soup = BeautifulSoup(res.text, "lxml")
    payload = {item['name']: item.get('value', '') for item in soup.select("input[name]")}
    payload['__EVENTTARGET'] = 'btnSearch'
    payload['txtStockCode'] = '00001'
    req = s.post(URL, data=payload, headers={"User-Agent": "Mozilla/5.0"})
    soup_obj = BeautifulSoup(req.text, "lxml")
    for items in soup_obj.select("table tbody tr"):
        data = [item.get_text(strip=True) for item in items.select("td")]
        print(data)
You need to change
http://www.hkexnews.hk/sdw/search/searchsdw.aspx
to
https://www.hkexnews.hk/sdw/search/searchsdw.aspx
The site now requires the secure protocol (HTTPS).
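In other words, only the scheme in the URL changes; the rest of the question's code stays the same. A minimal check of the fix:

import requests

# Same entry point as in the question, with the HTTPS URL swapped in.
URL = "https://www.hkexnews.hk/sdw/search/searchsdw.aspx"

with requests.Session() as s:
    s.headers = {"User-Agent": "Mozilla/5.0"}
    res = s.get(URL)
    print(res.status_code)  # should now return 200 with the form HTML in res.text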

Python Href scraping

I'm trying to loop over hrefs and get the full URLs. I've managed to extract the href, but I need the full URL to get into that link. This is my code at the minute:
import requests
from bs4 import BeautifulSoup

webpage_response = requests.get('http://www.harness.org.au/racing/results/?activeTab=tab')
soup = BeautifulSoup(webpage_response.content, "html.parser")

# only finding one track
# soup.table to find all links for days racing
harness_table = soup.table

# scrapes a href that is an incomplete URL that I'm trying to get to
for link in soup.select(".meetingText > a"):
    link.insert(0, "http://www.harness.org.au")
    webpage = requests.get(link)
    new_soup = BeautifulSoup(webpage.content, "html.parser")
    # work through table to get links to tracks
    print(new_soup)
You can store the base URL of the website in a variable, and then once you get the href from the link you can join the two to create the next URL.
import requests
from bs4 import BeautifulSoup

base_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?activeTab=tab')
soup = BeautifulSoup(webpage_response.content, "html.parser")

# only finding one track
# soup.table to find all links for days racing
harness_table = soup.table

# scrapes a href that is an incomplete URL that I'm trying to get to
for link in soup.select(".meetingText > a"):
    webpage = requests.get(base_url + link["href"])
    new_soup = BeautifulSoup(webpage.content, "html.parser")
    # work through table to get links to tracks
    print(new_soup)
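If you prefer not to concatenate strings by hand, urllib.parse.urljoin builds the same absolute URL (a small equivalent sketch, reusing soup and base_url from the code above):

from urllib.parse import urljoin

# Equivalent to base_url + link["href"] for site-relative hrefs.
for link in soup.select(".meetingText > a"):
    full_url = urljoin(base_url, link["href"])
    print(full_url)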
Try this solution. Maybe you'll like this library.
from simplified_scrapy import SimplifiedDoc,req
url = 'http://www.harness.org.au/racing/results/?activeTab=tab'
html = req.get(url)
doc = SimplifiedDoc(html)
links = [doc.absoluteUrl(url,ele.a['href']) for ele in doc.selects('td.meetingText')]
print(links)
Result:
['http://www.harness.org.au/racing/fields/race-fields/?mc=BA040320', 'http://www.harness.org.au/racing/fields/race-fields/?mc=BH040320', 'http://www.harness.org.au/racing/fields/race-fields/?mc=RE040320']

Scraping Amazon products names

I am trying to gather the product names from the first two pages on Amazon for a given seller. When I request the page, it has all the elements I need; however, when I use BeautifulSoup, they are not being listed. Here is my code:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent':'Mozilla/5.0'}
res = requests.get("https://www.amazon.com/s?me=A3WE363L17WQR&marketplaceID=ATVPDKIKX0DER", headers=headers)
#print(res.text)
soup = BeautifulSoup(res.text, "html.parser")
soup.find_all("a",href=True)
The links to the products are not listed. If the Amazon API provides this information, I am open to using it (please provide some examples of its usage). Thanks a lot in advance.
I have extracted the product names from the alt attribute. Is this what you intended?
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.amazon.com/s?me=A3WE363L17WQR&marketplaceID=ATVPDKIKX0DER')
soup = bs(r.content, 'lxml')
items = [item['alt'] for item in soup.select('.a-link-normal [alt]')]
print(items)
Over two pages:
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.amazon.com/s?i=merchant-items&me=A3WE363L17WQR&page={}&marketplaceID=ATVPDKIKX0DER&qid=1553116056&ref=sr_pg_{}'

for page in range(1, 3):
    r = requests.get(url.format(page, page))
    soup = bs(r.content, 'lxml')
    items = [item['alt'] for item in soup.select('.a-link-normal [alt]')]
    print(items)
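If the response comes back without the product grid (Amazon sometimes serves a different page to non-browser clients), passing the same User-Agent header as in the question may help; a small adjustment to the loop above:

# Same loop, but with a browser-like User-Agent header (as in the question).
headers = {'User-Agent': 'Mozilla/5.0'}
for page in range(1, 3):
    r = requests.get(url.format(page, page), headers=headers)
    soup = bs(r.content, 'lxml')
    items = [item['alt'] for item in soup.select('.a-link-normal [alt]')]
    print(items)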

How to scrape next page data as i do in the first page?

I have the following code:
from bs4 import BeautifulSoup
import requests
import csv

url = "https://coingecko.com/en"
base_url = "https://coingecko.com"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

names = [div.a.span.text for div in soup.find_all("div", attrs={"class": "coin-content center"})]
Link = [base_url + div.a["href"] for div in soup.find_all("div", attrs={"class": "coin-content center"})]

for link in Link:
    inner_page = requests.get(link)
    inner_soup = BeautifulSoup(inner_page.content, "html.parser")
    indent = inner_soup.find("div", attrs={"class": "py-2"})
    content = indent.div.next_siblings
    Allcontent = [sibling for sibling in content if sibling.string is not None]
    print(Allcontent)
I have successfully entered the inner pages and grabbed the information for every coin listed on the first page. But there are further pages: 1, 2, 3, 4, 5, 6, 7, 8, 9, etc. How can I go through all the next pages and do the same as before?
Further, the output of my code contains a lot of \n and spaces. How can I fix that?
You need to generate all the page URLs, request them one by one, and parse each with bs4.
from bs4 import BeautifulSoup
import requests

req = requests.get('https://www.coingecko.com/en')
soup = BeautifulSoup(req.content, 'html.parser')

last_page = soup.select('ul.pagination li:nth-of-type(8) > a:nth-of-type(1)')[0]['href']
lp = last_page.split('=')[-1]

count = 0
for i in range(int(lp)):
    count += 1
    url = 'https://www.coingecko.com/en?page=' + str(count)
    print(url)
    requests.get(url)  # requests each page one by one till the last page
    # parse your fields here using bs4
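Inside that loop you could, for example, reuse the same parsing you already wrote for the first page (a sketch based on the question's own selectors, which may break if the site's markup changes):

# Sketch: parse each paginated page with the question's selectors.
page = requests.get(url)
page_soup = BeautifulSoup(page.content, 'html.parser')
names = [div.a.span.text for div in page_soup.find_all("div", attrs={"class": "coin-content center"})]
print(names)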
The way you have written your script makes it look messy. Try .select() to make it concise and less prone to breakage. Although I could not find any further usage of names in your script, I kept it as it is. Here is how you can get all the available links while traversing multiple pages.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

url = "https://coingecko.com/en"

while True:
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")
    names = [item.text for item in soup.select("span.d-lg-block")]
    for link in [urljoin(url, item["href"]) for item in soup.select(".coin-content a")]:
        inner_page = requests.get(link)
        inner_soup = BeautifulSoup(inner_page.text, "lxml")
        desc = [item.get_text(strip=True) for item in inner_soup.select(".py-2 p") if item.text]
        print(desc)
    try:
        url = urljoin(url, soup.select_one(".pagination a[rel='next']")['href'])
    except TypeError:
        break
Btw, the whitespace has also been taken care of by using .get_text(strip=True).