BeautifulSoup - problems for a web crawler - Python

How do I correctly output all the links on this news website (in list form)?
After getting them as a list, how can I return 3~5 links at a time, chosen at random?
Note: the content I need starts around line 739 of the page source (it may shift a bit because the page refreshes every day), inside
div class="abdominis rlby clearmen"
and I need every link inside this kind of element:
<a href="https://tw.news.appledaily.com/life/realtime/20180308/1310910/">
Thanks! The code is below:
from bs4 import BeautifulSoup
from flask import Flask, request, abort
import requests
import re
import random
import types
target_url = 'http://www.appledaily.com.tw/realtimenews/section/new/'
print('Start parsing appleNews....')
rs = requests.session()
res = rs.get(target_url, verify=False)
soup = BeautifulSoup(res.text, 'html.parser')
#can output all links but with useless information
contents = soup.select("div[class='abdominis rlby clearmen']")[0].find_all('a')
print(contents)
#can output single link but not in list form
#contents = soup.select("div[class='abdominis rlby clearmen']")[0].find('a').get('href')
#print(contents)

Here is a solution which will append each link to a list if it is contained in the specified div.
from bs4 import BeautifulSoup
from flask import Flask, request, abort
import requests
import re
import random
import types
target_url = 'http://www.appledaily.com.tw/realtimenews/section/new/'
print('Start parsing appleNews....')
rs = requests.session()
res = rs.get(target_url, verify=False)
soup = BeautifulSoup(res.text, 'html.parser')
list_links = [] # Create empty list
for a in soup.select("div[class='abdominis rlby clearmen']")[0].findAll(href=True): # find links inside the div
    list_links.append(a['href']) # append to the list
    print(a['href']) # check links
for l in list_links: # print list to screen (2nd check)
    print(l)
To pick random links to be returned:
import random # import the random module
random_list = [] # create a list to hold the random picks, if needed..
random.shuffle(list_links) # randomly shuffle the list in place
for i in range(5): # specify range (5 items in this instance)
    try:
        res = list_links.pop(random.randint(0, len(list_links) - 1)) # pop an item at a random valid index
        print(res) # print to screen..
        random_list.append(res) # or append to random_list
    except (IndexError, ValueError): # nothing left to pop
        pass
One last edit, as you asked for it to be returned.
Here it is as a function that returns a list of x random links.
def return_random_link(list_, num):
    """ Takes in a list and returns a random selection of items """
    random.shuffle(list_)
    random_list = []
    for i in range(num):
        try: # try to append to the list
            r = list_.pop(random.randint(0, len(list_) - 1))
            random_list.append(r)
        except (IndexError, ValueError): # no items left
            return random_list # return whatever has been collected so far
    return random_list

random_list = return_random_link(list_links, 5)
for i in random_list:
    print(i)

If you want the link tag without its descendants, you can clear them:
for elm in contents:
    elm.clear()
I imagine I'd be more interested in extracting just the links, though:
contents = [a['href'] for a in contents]
To get results in a random order, try using random.shuffle() and grabbing however many elements from the reshuffled list at a time you need.
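For example, a minimal sketch of that idea, assuming contents already holds the list of hrefs extracted above (the helper name random_batch and the 3~5 batch size are just illustrative, matching what the question asked for):

import random

def random_batch(links, low=3, high=5):
    """Return between low and high links chosen at random, without repeats."""
    shuffled = links[:]  # work on a copy so the original list is untouched
    random.shuffle(shuffled)  # reorder the copy in place
    k = min(random.randint(low, high), len(shuffled))  # never ask for more than we have
    return shuffled[:k]

print(random_batch(contents))  # e.g. 3 to 5 of the extracted hrefs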

Related

Trying to get data from a table using beautifulsoup in python

Trying to get the "all splits" line of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (the HTML is shown in the picture). My code returns the 'all splits' text instead of the numbers I'm looking for. How do I change the lookups in the GetStats function to get the numbers instead of the first-column descriptors?
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv

urls = []
data = []
for year in range(2003, 2005):
    for page in range(1, 9):
        url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
        if url is not None:
            urls.append(url)

def GetData(url):
    names_list = []  # names of players
    pers = []  # player efficiency ratings
    playeridlist = []  # list of player ids to be used in making the new stats-search url
    statsurls = []  # list of urls generated to get player stats
    # makes a pattern for the function to look for
    pattern = re.compile(r'playerId=(\d+)')
    # sets up soup
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    # finds player names and adds them to the list
    names = soup.find(lambda tag: tag.name == 'a' and tag.has_attr('href') and 'playerId' in tag['href'])
    bodytext = names.text
    names_list.append(bodytext)
    # finds the player efficiency rating and adds it to the list
    pertag = soup.find('td', class_='sortcell')
    per = pertag.text
    pers.append(per)
    # finds the player id
    names = soup.find('a', href=pattern)
    player_id = names['href'].split('playerId=')[1]
    playeridlist.append(player_id)
    # uses the player id to make a list of new urls for that player's stats
    for player_id in playeridlist:
        statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
        if statsurl is not None:
            statsurls.append(statsurl)
    # parses stats to get stats
    def GetStats(statsurl):  # GO BACK AND MAKE A THREAD EXECUTER STATEMENT WITHIN GETDATA FUNCTION BELOW THIS!!!
        statsreq = requests.get(statsurl)
        statssoup = BeautifulSoup(statsreq.text, 'lxml')
        focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
        playerstathtml = focusing_search.find('td', class_='Table__TD')
        stat_values = [playerstats.text for playerstats in playerstathtml]
        print(stat_values)
    GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")
    #name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))
    print(f"{bodytext}: {per}")
    print(player_id)

GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')
To get the all_splits stats from:
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
This is what I did:
I grabbed the table body using soup.select
Then I grabbed the headings and relevant stats by iterating through the columns/rows.
The list comprehension provides the text in list format, which is easy to convert to a dataframe.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'
soup = BeautifulSoup(requests.get(url).content, "html.parser")
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
headings = [h.text for h in t[0].find_next('tr').find_all('td')]
all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
df = pd.DataFrame([all_splits], columns=headings)
print(df)
Output: a one-row DataFrame with the table headings as columns and the All Splits values as the row.

Optimize scraping through list of urls and write to csv

With a CSV of 20k+ URLs I want to scrape each one and find the HTML element "super-attribute-select". If found, write the URL to column A and the product number (SKU) to column B. If not found, write the URL to column C and the SKU to column D. Finally, save the dataframe to a CSV file.
If I run the following code it works, but my program runs out of memory. I'd like to find a way to optimize this. Right now ~1500 URLs take 5 hours to process, while the entire CSV is 20k.
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas import Series

urlList = pd.read_csv(r"url.csv")
urlList = urlList.url.tolist()
notfound = []
found = []
skulist = []
skumissinglist = []

# Function scrape: pass url, open with soup, and find class
def scrape(url):
    tag = 'select'
    classused = "super-attribute-select"
    d = dict(A=np.array(found), B=np.array(skulist), C=np.array(notfound), D=np.array(skumissinglist))
    try:
        content = urllib.request.urlopen(url)
        soup = BeautifulSoup(content, features="html.parser")
        sku = soup.find("div", {"itemprop": "sku"}).string
        result = soup.find(tag, class_=classused)
        # soup returns None if it can't find anything
        if result is None:
            notfound.append(url)
            skumissinglist.append(sku)
        else:
            found.append(url)
            skulist.append(sku)
    except:
        result = print("Some extraction went wrong")
    df = pd.DataFrame(dict([(k, Series(v)) for k, v in d.items()]))
    df = df.to_csv('Test.csv')

for i in urlList:
    scrape(i)
If I were doing this, I would try a few things:
(1) Update a dictionary instead of appending to a list. I think dictionaries are faster and more memory-efficient than lists.
(2) Rather than export each URL result as a CSV with the same name, either (a) preferred: wait until you are done to export all results as a single CSV, or (b) worse: maybe export them to different filenames by using f-strings instead of overwriting 'Test.csv' every time.
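As a rough sketch of (1) and (2a) combined, keeping everything in one dict of lists and writing the CSV once at the end (the scraping logic is the same as in the question; the output file name results.csv is just a placeholder):

import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

results = {"A": [], "B": [], "C": [], "D": []}  # found urls, found skus, missing urls, missing skus

def scrape(url):
    try:
        content = urllib.request.urlopen(url)
        soup = BeautifulSoup(content, features="html.parser")
        sku = soup.find("div", {"itemprop": "sku"}).string
        if soup.find("select", class_="super-attribute-select") is None:
            results["C"].append(url)
            results["D"].append(sku)
        else:
            results["A"].append(url)
            results["B"].append(sku)
    except Exception:
        print("Some extraction went wrong for", url)

urlList = pd.read_csv("url.csv").url.tolist()
for url in urlList:
    scrape(url)

# Build the DataFrame and write the CSV exactly once, after all URLs are processed
pd.DataFrame({k: pd.Series(v) for k, v in results.items()}).to_csv("results.csv", index=False)

This avoids rebuilding and rewriting the whole DataFrame on every single URL, which is where a lot of the time and memory goes.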
You could use a pool, either from gevent or the built-in one from urllib3 (or requests). Then you could do 10 or 100 at a time depending on the pool size, and use an async queue to get the remaining ones as the pools get exhausted.
from gevent import monkey, spawn, joinall
monkey.patch_all()
from gevent.pool import Pool as GeventPool
import pandas as pd
from pandas import Series
import numpy as np
import requests
from bs4 import BeautifulSoup

urlList = pd.read_csv(r"url.csv")
urlList = urlList.url.tolist()
pool = GeventPool(10)
notfound = []
found = []
skulist = []
skumissinglist = []
count = len(urlList)

# Function scrape: pass url, open with soup, and find class
def scrape(url):
    tag = 'select'
    classused = "super-attribute-select"
    d = dict(A=np.array(found), B=np.array(skulist), C=np.array(notfound), D=np.array(skumissinglist))
    try:
        content = requests.get(url).text
        soup = BeautifulSoup(content, features="html.parser")
        sku = soup.find("div", {"itemprop": "sku"}).string
        result = soup.find(tag, class_=classused)
        # soup returns None if it can't find anything
        if result is None:
            notfound.append(url)
            skumissinglist.append(sku)
        else:
            found.append(url)
            skulist.append(sku)
    except:
        print("Some extraction went wrong")
    df = pd.DataFrame(dict([(k, Series(v)) for k, v in d.items()]))
    return df.to_csv('Test.csv')

pool.map(scrape, urlList)

Python Beautiful Soup not looping results

I'm using BS4 for the first time and need to scrape the items from an online catalogue to CSV.
I have set up my code, however when I run it the results only repeat the first item in the catalogue n times (where n is the number of items).
Can someone review my code and let me know where I am going wrong?
Thanks
import requests
from bs4 import BeautifulSoup
from csv import writer
#response = requests.get('https://my.supplychain.nhs.uk/Catalogue/browse/27/anaesthetic-oxygen-and-resuscitation?CoreListRequest=BrowseCoreList')
response = requests.get('https://my.supplychain.nhs.uk/Catalogue/browse/32/nhs-cat?LastCartId=&LastFavouriteId=&CoreListRequest=BrowseAll')
soup = BeautifulSoup(response.text , 'html.parser')
items = soup.find_all(class_='productPrevDetails')
#print(items)
for item in items:
    ItemCode = soup.find(class_='product_npc ').get_text().replace('\n','')
    ItemNameS = soup.select('p')[58].get_text()
    ProductInfo = soup.find(class_='product_key_info').get_text()
    print(ItemCode, ItemNameS, ProductInfo)
You always see the first result because you are searching soup, not the item. Try
for item in items:
    ItemCode = item.find(class_='product_npc ').get_text().replace('\n','')
    ItemNameS = item.select('p')[58].get_text()
    ProductInfo = item.find(class_='product_key_info').get_text()
    print(ItemCode, ItemNameS, ProductInfo)

Add a string into a url python

I am trying to add a string in the middle of a URL. Somehow my output looks like this:
http://www.Holiday.com/('Woman',)/Beach
http://www.Holiday.com/('Men',)/Beach
Instead, it should look like this:
http://www.Holiday.com/Woman/Beach
http://www.Holiday.com/Men/Beach
The code which I am using looks like the following:
list = {'Woman','Men'}
url_test = 'http://www.Holiday.com/{}/Beach'
for i in zip(list):
    url = url_test.format(str(i))
    print(url)
Almost there. Just no need for zip:
items = {'Woman','Men'} # notice that this is a `set` and not a list
url_test = 'http://www.Holiday.com/{}/Beach'
for i in items:
    url = url_test.format(i)
    print(url)
The purpose of the zip function is to join several collections by the index of the item. When zip joins the values from each collection it places them in a tuple, whose __str__ representation is exactly what you got.
Here you just want to iterate over the items in the collection.
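A quick illustration of the difference, using the values from the question (set ordering may vary):

items = {'Woman', 'Men'}
print(list(zip(items)))    # [('Woman',), ('Men',)] - zip wraps each element in a 1-tuple
print(str(('Woman',)))     # ('Woman',) - exactly the text that ended up in the URL
for i in items:            # iterating the set directly gives plain strings
    print('http://www.Holiday.com/{}/Beach'.format(i))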
You can also try this. And please don't use list as a variable name:
lst = {'Woman','Men'}
url_test = 'http://www.Holiday.com/%s/Beach'
for i in lst:
    url = url_test % i
    print(url)
from urllib.request import urlopen
from bs4 import BeautifulSoup as BS
import re

url = "https://www.imdb.com/chart/top?ref_=nv_mv_250"
html = urlopen(url)
url_list = BS(html, 'lxml')
type(url_list)
all_links = url_list.find_all('a', href=re.compile("/title/tt"))
for link in all_links:
    print(link.get("href"))
    all_urls = link.get("href")
url_test = 'http://www.imdb.com/{}/'
for i in all_urls:
    urls = url_test.format(i)
    print(urls)
This is the code to scrape the URLs of all 250 movies from the main URL.
But the code gives results like:
http://www.imdb.com///
http://www.imdb.com/t/
http://www.imdb.com/i/
http://www.imdb.com/t/
http://www.imdb.com/l/
http://www.imdb.com/e/
http://www.imdb.com///
and so on ...
How can I split 'all_urls' using a comma, or how can I make a list of URLs out of 'all_urls'?
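The for i in all_urls loop iterates over the characters of a single href string (the last one assigned in the previous loop), which is why one letter shows up per URL. A minimal sketch of collecting all the hrefs into a list instead, reusing the all_links result from the snippet above (untested against the current IMDb markup):

all_urls = [link.get("href") for link in all_links]              # relative paths like /title/tt0111161/
full_urls = ["http://www.imdb.com" + path for path in all_urls]  # prepend the domain to each path
for url in full_urls:
    print(url)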

Get list of all paginated URL's from links in txt file in python requests

Hi guys, I need to define a function to get the list of all paginated URLs at the bottom of the page, from input links in a txt file, in Python.
Here is an example of what I need done.
Input link
http://www.apartmentguide.com/apartments/Alabama/Hartselle/
Desired Output
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=6
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=7
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=8
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=9
and so on, up to whatever limit each input URL has.
This is the function I have written so far, but it's not working; I am not good with Python either.
import requests
#from bs4 import BeautifulSoup
from scrapy import Selector as Se
import urllib2

lists = open("C:\Users\Administrator\Desktop\\3.txt","r")
read_list = lists.read()
line = read_list.split("\n")

def get_links(line):
    for each in line:
        r = requests.get(each)
        sel = Se(text=r.text, type="html")
        next_ = sel.xpath('//a[@class="next sprite"]//@href').extract()
        for next_1 in next_:
            next_2 = "http://www.apartmentguide.com" + next_1
            print next_2
            get_links(next_1)

get_links(line)
Below are two ways to do this.
import mechanize
import requests
from bs4 import BeautifulSoup, SoupStrainer
import urlparse
import pprint

#-- Mechanize --
br = mechanize.Browser()

def get_links_mechanize(root):
    links = []
    br.open(root)
    for link in br.links():
        try:
            if dict(link.attrs)['class'] == 'page':
                links.append(link.absolute_url)
        except:
            pass
    return links

#-- Requests / BeautifulSoup / urlparse --
def get_links_bs(root):
    links = []
    r = requests.get(root)
    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('class') and 'page' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))
    return links

#with open("C:\Users\Administrator\Desktop\\3.txt","r") as f:
#    for root in f:
#        links = get_links(root)
#        # <Do something with links>

root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
print "Mech:"
pprint.pprint( get_links_mechanize(root) )
print "Requests/BS4/urlparse:"
pprint.pprint( get_links_bs(root) )
One uses mechanize -- it's a bit smarter with URLs but it's a lot slower and may be overkill depending on what else you're doing.
The other uses requests to fetch the page (urllib2 would suffice), BeautifulSoup to parse the markup and urlparse to form absolute URLs from the relative URLs in the page you listed.
Note that both of these functions return the following list:
['http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5']
which has duplicates. You can get rid of the duplicates by changing
return links
to
return list(set(links))
for whatever method you choose.
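Note that list(set(links)) does not preserve the original page order. If order matters, a small order-preserving variant (a sketch that works in both Python 2 and 3):

def dedupe_keep_order(links):
    seen = set()
    unique = []
    for link in links:
        if link not in seen:  # keep only the first occurrence of each URL
            seen.add(link)
            unique.append(link)
    return unique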
EDIT:
I noticed that the above functions only returned the links to pages 2-5, and you'd have to navigate those pages to see that there were in fact 10 pages.
A completely different approach would be to scrape the "root" page for number of results, then predict how many pages that would result in, then build links from that.
Since there are 20 results per page, figuring out how many pages there are is straightforward. Consider:
import requests, re, math, pprint

def scrape_results(root):
    links = []
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1)) # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0)) # ceil(182/20) => 10
    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))
    return links

pprint.pprint(scrape_results(root))
This will be the fastest method of the 3, but possibly more error prone.
EDIT 2:
Maybe something like:
import re, math, pprint
import requests, urlparse
from bs4 import BeautifulSoup, SoupStrainer

def get_pages(root):
    links = []
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    num_results = int(mat.group(1)) # 182 at the moment
    num_pages = int(math.ceil(num_results/20.0)) # ceil(182/20) => 10
    # Construct links for pages 1-10
    for i in range(num_pages):
        links.append("%s?page=%d" % (root, (i+1)))
    return links

def get_listings(page):
    links = []
    r = requests.get(page)
    for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link.has_attr('data-listingid') and 'name' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))
    return links

root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
listings = []
for page in get_pages(root):
    listings += get_listings(page)

pprint.pprint(listings)
print(len(listings))
With re I was unsure, so I tried XPath instead.
import math
import csv
import requests
from scrapy import Selector

links = open("C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\2.txt","r")
read_list = links.read()
line = read_list.split("\n")
for each in line:
    lines = []
    r = requests.get(each)
    sel = Selector(text=r.text, type="html")
    mat = sel.xpath('//h1//strong/text()').extract()
    mat = str(mat)
    mat1 = mat.replace(" apartments for rent']","")
    mat2 = mat1.replace("[u'","")
    mat3 = int(mat2)
    num_pages = int(math.ceil(mat3/20.0))
    for i in range(num_pages):
        lines.append("%s/Page%d" % (each, (i+1)))
    with open('C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\test.csv', 'ab') as f:
        writer = csv.writer(f)
        for val in lines:
            writer.writerow([val])
