Python issue for crawling multiple page title - python

I am a marketer and want to conduct some basic market research using Python.
I wrote some simple code to crawl the titles on multiple pages, but I can't get the title text into a list or export it to Excel format. How can I do this?
I tried to create a list and used the extend() method to put these looped titles on the list, but it did not work:
import requests
import pandas as pd
from bs4 import BeautifulSoup
def content_get(url):
count = 0
while count < 4: #this case was to crawl titles of 4 pages
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
titles = soup.find(id="main-container").find_all("div", class_="r-ent")
for title in titles:
print([title.find('div', class_='title').text])
nextpageurl = soup.find("a", string="‹ 上頁")["href"]
url = "https://www.ptt.cc" + nextpageurl
count += 1
firstpage = "https://www.ptt.cc/bbs/movie/index9002.html"
content_get(firstpage)

You need to add the titles to a list outside of the while loop:
def content_get(url, pages=4):
    """Collect post titles from consecutive PTT board index pages.

    Starting at ``url``, download a page, gather the text of every
    ``<div class="title">`` element, then follow the "‹ 上頁" link and
    repeat.

    Args:
        url: Address of the first index page to fetch.
        pages: Maximum number of pages to crawl. Defaults to 4, the count
            the loop was previously hard-coded to.

    Returns:
        A flat list of title strings with newlines removed, in the order
        the pages were visited.
    """
    titles = []
    for _ in range(pages):
        r = requests.get(url)
        # Name the parser explicitly: without it bs4 warns and picks
        # whichever parser happens to be installed.
        soup = BeautifulSoup(r.text, "html.parser")
        titles.extend(
            title.text.replace('\n', '')
            for title in soup.find_all('div', {'class': 'title'})
        )
        prev_link = soup.find("a", string="‹ 上頁")
        if prev_link is None or not prev_link.get("href"):
            # No usable link to follow, so stop instead of raising.
            break
        url = "https://www.ptt.cc" + prev_link["href"]
    return titles
If you would rather not use a list comprehension to collect each page's titles, it can be replaced with a traditional for loop:
# Drop-in replacement for the list comprehension inside content_get's loop.
title_page = []
# Keep the tags under their own name: `titles` is the accumulator list in
# content_get and must not be overwritten here.
title_divs = soup.find_all('div', {'class': 'title'})
for title_div in title_divs:
    title_page.append(title_div.text.replace('\n', ''))
For the excel file:
def to_excel(text, path='output.xlsx'):
    """Write the collected titles to an Excel workbook.

    Args:
        text: Sequence of title strings; each one becomes a row in a
            single 'Title' column.
        path: Destination file. Defaults to 'output.xlsx', the name that
            was previously hard-coded.

    Returns:
        The result of ``DataFrame.to_excel`` for the given path.
    """
    df = pd.DataFrame(text, columns=['Title'])
    return df.to_excel(path)

Related

Extracting URL From Span Element Without href

I am attempting to extract links from a website that does not use a href. I have tried multiple iterations of trying to find the tag associated with the url that from what I can gather is between <span> elements.
import requests
from bs4 import BeautifulSoup
url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
f = open("test12.csv", "w")
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_ = 'listing-item-inner')
for list in lists:
title = list.find('span', class_ = '$0')
webs = list.find('#text', class_ = 'fa-fa.link')
address = list.find('ul', class_ = 'post-meta')
temp = list.find('span', class_ = 'text')
temp2 = list.find('i', class_ = '(text)')
info = [title, webs, address, temp, temp2]
f.write(str(info))
f.write("\n")
print(info)
The desired output is the address text inside the <span></span> element (for example, 345 40th Ave N), together with the URL that follows <i class="fa fa-link"> and the text that follows <i class="fa fa-phone">, with these three elements written to a CSV file.
You could select the <i> with class fa-link and then read the element that follows it with e.find(class_ = 'fa-link').next:
for e in lists:
    # Look the icon up once and reuse it for both the existence check and
    # the text extraction.
    link_icon = e.find(class_='fa-link')
    print(link_icon.next.strip() if link_icon else '')
Note: Do not use the names of built-ins such as list as variable names, and always check that the element you are searching for actually exists.
Example
import csv

import requests
from bs4 import BeautifulSoup


def _following_text(icon):
    """Return the stripped text that directly follows ``icon``, or '' if the icon is missing."""
    return str(icon.next).strip() if icon is not None else ''


url = 'https://www.flavortownusa.com/locations'
# The page has to be downloaded before it can be parsed.
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# newline='' leaves line endings to the csv module; lineterminator='\n'
# keeps the same row separator the hand-written version used.
with open('somefile.csv', 'a', encoding='utf-8', newline='') as f:
    # csv.writer quotes fields, so a comma inside an address stays in one column.
    writer = csv.writer(f, lineterminator='\n')
    for e in soup.find_all('div', class_='listing-item-inner'):
        title = e.h3.text if e.h3 is not None else ''
        webs = _following_text(e.select_one('.fa-link'))
        address = e.span.text if e.span is not None else ''
        phone = _following_text(e.select_one('.fa-phone'))
        writer.writerow([title, webs, address, phone])

Looping until max results

I'm pretty new to web scraping but enjoying it so far so thought I'd test myself!
I've written this query to scrape this website but just wondering is there a way of making it more efficient? At the moment, I've had to set the max page to 87 as this is the last page that guitars appear on. However, amps only have 15 pages of results but I'm still looping through 87. Any ideas appreciated!
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []
n = 88
#ELECTRIC GUITAR DATA
for category in ['guitars/electric/','guitars/bass/','amps/','guitars/acoustic/','pedals/']:
for x in range(1,n):
url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(x)
print(url)
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
for index in range(0, len(products)):
guitar_products.append({
'product': products[index],
'price' : prices[index],
'avail' : avails[index]
})
guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace('[^\d.]', '', regex=True))
Thanks
Try the following approach:
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []

# Walk every category, following pages until the "next" button disappears.
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    page_number = 1

    while True:
        url = f"https://www.guitarguitar.co.uk/{category}page-{page_number}"
        print(url)
        page_number += 1

        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')

        # Each product sits in its own container, so the three fields are
        # read together and cannot drift out of step.
        for div_product in soup.find_all('div', class_="product-inner"):
            product = div_product.find('h3', {'class': 'qa-product-list-item-title'}).get_text(strip=True)
            price = div_product.find('span', {'class': 'js-pounds'}).get_text(strip=True)
            avail = div_product.find('div', {'class': 'availability'}).get_text(strip=True)
            guitar_products.append({'product': product, 'price': price, 'avail': avail})

        # Is there a next button?
        if soup.find('a', class_="next-page-button") is None:
            print("No more")
            break

guitar_data = pd.DataFrame(guitar_products)
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Improvements:
This looks for the Next button on each page to then skip to the next category.
It locates the <div> holding each product and then uses a single find to get each product detail. This avoids the need to build multiple lists and then join them.
Build the URL using a Python f string.
You can check H1:
*soup = BeautifulSoup(page.content, 'html.parser')*
if soup.find('h1').contents[0] == 'Page Not Found':
break
or change the loop from for to while:
is_page = True
x = 0
while is_page:
    x = x + 1
    # ... build the URL for page x, fetch it and parse it into `soup` ...
    heading = soup.find('h1')
    if heading is not None and heading.contents and heading.contents[0] == 'Page Not Found':
        # Clearing the flag ends the loop at the next condition check, so
        # no separate break is needed.
        is_page = False
This is probably not the most elegant solution, but it is functional and straightforward. An infinite loop which ends if no product is found.
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []

# Scrape every category, paging until a page comes back with no products.
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    n = 1  # page numbering restarts for each category
    while True:
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(n)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        products = [product.text.strip() for product in soup.find_all('h3', {'class': 'qa-product-list-item-title'})]
        # [:-1] drops the last character of each price string, as before.
        prices = [price.text.strip()[:-1] for price in soup.find_all('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.find_all('div', {'class': 'availability'})]

        if not products:
            # An empty page marks the end of this category.
            break

        # zip stops at the shortest list, so a page with a missing price or
        # availability entry no longer raises IndexError.
        for product, price, avail in zip(products, prices, avails):
            guitar_products.append({
                'product': product,
                'price': price,
                'avail': avail,
            })
        n += 1

guitar_data = pd.DataFrame(guitar_products)
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))

web-scrape: get H4 attributes & href

I am trying to web-scrape a website, but I can't get access to the attributes of some fields.
here is the code i used:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
scrap_list = pd.DataFrame()
for path in range(10): # scroll over the categories
for path in range(10): # scroll over the pages
url = 'https://www.samehgroup.com/index.php?route=product/category'+str(page)+'&'+'path='+ str(path)
req = urllib3.PoolManager()
res = req.request('GET', URL)
soup = BeautifulSoup(res.data, 'html.parser')
soup.findAll('h4', {'class': 'caption'})
# extract names
scrap_name = [i.text.strip() for i in soup.findAll('h2', {'class': 'caption'})]
scrap_list['product_name']=pd.DataFrame(scrap_name,columns =['Item_name'])
# extract prices
scrap_list['product_price'] = [i.text.strip() for i in soup.findAll('div', {'class': 'price'})]
product_price=pd.DataFrame(scrap_price,columns =['Item_price'])
I want an output that provides me with each product and its price. I still can't get that right.
Any help would be very much appreciated.
I think the problem here was looping through the website pages. I got the code below working by first making a list of urls containing numbered 'paths' corresponding to pages on the website. Then looping through this list and applying a page number to the url.
If you wanted to only get all the products from a certain page, this page can be selected from the urlist and by index.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

BASE_URL = 'https://www.samehgroup.com/index.php?route=product/category&path='
MAX_PAGES = 100  # upper bound per category; chosen to cover the site's max page of 93

# One listing URL per category 'path' (1-9).
urlist = [BASE_URL + str(i) for i in range(1, 10)]

namelist = []
newprice = []

for urlunf in urlist:  # outer loop: category 'path'
    # NOTE(review): page numbering starts at 0 as before; confirm the site
    # does not serve page 0 and page 1 as the same listing.
    for n in range(MAX_PAGES):  # inner loop: page within the category
        url = urlunf + '&page=' + str(n)
        page = requests.get(url).text
        # Name a concrete parser so the result does not depend on which
        # optional parsers are installed.
        soup = BeautifulSoup(page, 'html.parser')
        products = soup.find_all('div', class_='caption')
        if not products:
            # No products means the category has run out of pages, so stop
            # requesting instead of trying every remaining page number.
            break

        for prod in products:
            name_tag = prod.find('h4')
            price_box = prod.find('p', class_='price')
            price_tag = price_box.find('span', class_='price-new') if price_box is not None else None
            if name_tag is None or price_tag is None:
                # Skip only the incomplete entry; the rest of the page is
                # still collected and both lists stay the same length.
                continue
            namelist.append(name_tag.text)
            newprice.append(price_tag.text)

        time.sleep(2)  # pause between page requests

# Build the frame from both columns at once.
df = pd.DataFrame({'name': namelist, 'price': newprice})

Use Beautiful Soup in scraping multiple websites

I want to know why lists all_links and all_titles don't want to receive any records from lists titles and links. I have tried also .extend() method and it didn't help.
import requests
from bs4 import BeautifulSoup
all_links = []
all_titles = []
def title_link(page_num):
page = requests.get(
'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/warszawa/page-%d/v%dc9073l3200008p%d'
% (page_num, page_num, page_num))
soup = BeautifulSoup(page.content, 'html.parser')
links = ['https://www.gumtree.pl' + link.get('href')
for link in soup.find_all('a', class_ ="href-link tile-title-text")]
titles = [flat.next_element for flat in soup.find_all('a', class_ = "href-link tile-title-text")]
print(titles)
for i in range(1,5+1):
title_link(i)
all_links = all_links + links
all_titles = all_titles + titles
i+=1
print(all_links)
import pandas as pd
df = pd.DataFrame(data = {'title': all_titles ,'link': all_links})
df.head(100)
#df.to_csv("./gumtree_page_1.csv", sep=';',index=False, encoding = 'utf-8')
#df.to_excel('./gumtree_page_1.xlsx')
When I ran your code, I got
NameError Traceback (most recent call last)
<ipython-input-3-6fff0b33d73b> in <module>
16 for i in range(1,5+1):
17 title_link(i)
---> 18 all_links = all_links + links
19 all_titles = all_titles + titles
20 i+=1
NameError: name 'links' is not defined
That points to a problem - variable named links is not defined in a global scope (where you add it to all_links). You can read about python scopes here. You'd need to return links and titles from title_link. Something similar to this:
def title_link(page_sum):
    # your code here
    # Hand both lists back to the caller; they are local to this function.
    return links, titles


# Pages 1 through 5: unpack each page's results and add them to the totals.
for page_index in range(1, 6):
    page_links, page_titles = title_link(page_index)
    all_links = all_links + page_links
    all_titles = all_titles + page_titles

print(all_links)
This code exhibits confusion about scoping. titles and links inside of title_link are local to that function. When the function ends, the data disappears and it cannot be accessed from another scope such as main. Use the return keyword to return values from functions. In this case, you'd need to return a tuple pair of titles and links like return titles, links.
Since functions should do one task only, having to return a pair reveals a possible design flaw. A function like title_link is overloaded and should probably be two separate functions, one to get titles and one to get links.
Having said that, the functions here seem like premature abstractions since the operations can be done directly.
Here's a suggested rewrite:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/warszawa/page-%d/v%dc9073l3200008p%d"
data = {"title": [], "link": []}

# Result pages 1-5: collect each listing's title node and absolute link.
for i in range(1, 6):
    page = requests.get(url % (i, i, i))
    soup = BeautifulSoup(page.content, "html.parser")
    titles = soup.find_all("a", class_="href-link tile-title-text")

    page_titles = [anchor.next_element for anchor in titles]
    page_links = ["https://www.gumtree.pl" + anchor.get("href") for anchor in titles]
    data["title"] += page_titles
    data["link"] += page_links

df = pd.DataFrame(data)
print(df.head(100))
Other remarks:
i+=1 is unnecessary; for loops move forward automatically in Python.
(1,5+1) is clearer as (1, 6).
List comprehensions are great, but if they run multiple lines, consider writing them as normal loops or creating an intermediate variable or two.
Imports should be at the top of a file only. See PEP-8.
list.extend(other_list) is preferable to list = list + other_list, which is slow and memory-intensive, creating a whole copy of the list.
Try this:
import pandas as pd
import requests
from bs4 import BeautifulSoup

all_links = []
all_titles = []


def title_link(page_num):
    """Fetch one results page and return its listing links and titles.

    Args:
        page_num: Page number, substituted into all three placeholders of
            the listing URL.

    Returns:
        A ``(links, titles)`` pair: absolute listing URLs and the node
        that directly follows each title anchor, in page order.
    """
    page = requests.get(
        'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/warszawa/page-%d/v%dc9073l3200008p%d'
        % (page_num, page_num, page_num))
    page.encoding = 'utf-8'
    soup = BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8')
    # Query the anchors once and derive both lists from the same result.
    anchors = soup.find_all('a', class_="href-link tile-title-text")
    links = ['https://www.gumtree.pl' + anchor.get('href') for anchor in anchors]
    titles = [anchor.next_element for anchor in anchors]
    print(titles)
    return links, titles


for i in range(1, 5 + 1):
    links, titles = title_link(i)
    all_links.extend(links)
    all_titles.extend(titles)
    # i+=1 not needed in python

print(all_links)

df = pd.DataFrame(data={'title': all_titles, 'link': all_links})
df.head(100)
I think you just needed to get links and titles out of title_link(page_num).
Edit: removed the manual incrementing per comments
Edit: changed the all_links = all_links + links to all_links.extend(links)
Edit: website is utf-8 encoded, added page.encoding = 'utf-8' and as extra (probably unnecessary) measure, from_encoding='utf-8' to the BeautifulSoup

requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied

I am working on a web scraping project and have run into the following error.
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Below is my code. I retrieve all of the links from the html table and they print out as expected. But when I try to loop through them (links) with requests.get I get the error above.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
links = (ref['href'])
print (links)
for link in links:
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
table = []
# Find all the divs we need in one go.
divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
for div in divs:
# find all the enclosing a tags.
anchors = div.find_all('a')
for anchor in anchors:
# Now we have groups of 3 list items (li) tags
lis = anchor.find_all('li')
# we clean up the text from the group of 3 li tags and add them as a list to our table list.
table.append([unicodedata.normalize("NFKD",lis[0].text).strip(), lis[1].text, lis[2].text.strip()])
# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
Your mistake is second for loop in code
for ref in table.find_all('a', href=True):
links = (ref['href'])
print (links)
for link in links:
ref['href'] gives you single url but you use it as list in next for loop.
So you have
for link in ref['href']:
and it gives you first char from url http://properties.kimcore... which is h
Full working code
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')

headers = ['Number', 'Tenant', 'Square Footage']

for ref in table.find_all('a', href=True):
    link = ref['href']
    print(link)

    # Separate names for the detail page keep the listing's `page`, `soup`
    # and `table` intact while the loop is running.
    detail_page = requests.get(link)
    detail_soup = BeautifulSoup(detail_page.content, 'html.parser')
    rows = []

    # Find all the divs we need in one go.
    divs = detail_soup.find_all('div', {'id': ['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # Find all the enclosing a tags.
        for anchor in div.find_all('a'):
            # Each anchor is expected to hold a group of 3 list items (li).
            lis = anchor.find_all('li')
            if len(lis) < 3:
                # Not a unit entry; indexing it would raise IndexError.
                continue
            # Clean up the text from the 3 li tags and add them as one row.
            rows.append([
                unicodedata.normalize("NFKD", lis[0].text).strip(),
                lis[1].text,
                lis[2].text.strip(),
            ])

    # All data for this link is collected, so show it as a DataFrame.
    df = DataFrame(rows, columns=headers)
    print(df)
BTW: if you use a comma, as in (ref['href'], ), then you get a tuple and the second for loop works correctly.
EDIT: it creates a list table_data at the start and adds all data to this list. It converts the list into a DataFrame at the end.
But now I see it reads the same page a few times - because in every row the same url is in every column. You would have to get the url from only one column.
EDIT: now it doesn't read the same url many times
EDIT: now it gets the text and href from the first link and adds them to every element in the list when you use append().
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')

table_data = []

# all rows in table except first ([1:]) - headers
rows = soup.select('table tr')[1:]

for row in rows:
    # Only the first link in the row is used, so each url is fetched once.
    link = row.find('a')
    if link is None or not link.get('href'):
        # A row without a usable link has nothing to fetch.
        continue
    link_href = link['href']
    link_text = link.text
    print('text:', link_text)
    print('href:', link_href)

    # Separate names for the detail page keep the listing's `page` and
    # `soup` intact.
    detail_page = requests.get(link_href)
    detail_soup = BeautifulSoup(detail_page.content, 'html.parser')

    divs = detail_soup.find_all('div', {'id': ['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        for anchor in div.find_all('a'):
            lis = anchor.find_all('li')
            if len(lis) < 3:
                # Not a unit entry; indexing it would raise IndexError.
                continue
            item1 = unicodedata.normalize("NFKD", lis[0].text).strip()
            item2 = lis[1].text
            item3 = lis[2].text.strip()
            # The link text and href are repeated on every unit row.
            table_data.append([item1, item2, item3, link_text, link_href])

    print('table_data size:', len(table_data))

headers = ['Number', 'Tenant', 'Square Footage', 'Link Text', 'Link Href']
df = DataFrame(table_data, columns=headers)
print(df)

Categories