IndexError while web scraping - Python

This is my code:
# coding: utf-8

# ## Extracting just the links from the Security home page

# In[126]:

base_url = "https://www.cnet.com"
additional_url = "/topics/security/how-to/"

import re
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep

# To keep a count of the number of articles to be scraped
limit = 0
next_page = base_url + additional_url

# List to store the links
list_of_links = []

# Change the limit as per requirements
while next_page and limit <= 200:
    temp_list_of_links = []

    # Load and extract the content of the page
    page = requests.get(next_page)
    #sleep(15)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find the 'news' links of the page
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('/news/'):
            temp_list_of_links.append(link['href'])

    # Save the unique links
    link_list = set(temp_list_of_links)
    # Find the length of the list of unique links
    length = len(link_list)
    #print(length)

    # Add the links to the final list
    list_of_links.extend(link_list)
    #sleep(120)

    # Increment the limit
    limit = limit + length

    # Find the link of the Show More page
    next_page = soup.find('a', class_='load-more')
    # Change the href to the Show More page link
    if next_page:
        next_page = base_url + next_page['href']

# In[127]:

# Final list with unique links
link_list = set(list_of_links)
# Remove the lone '/news/' link
link_list.remove('/news/')
# Converting the set into a list
link_list = list(link_list)

# ## Extracting the data from each link

# In[128]:

all_articles = []

for item in link_list:
    new_page = base_url + item
    page = requests.get(new_page)
    soup = BeautifulSoup(page.content, 'html.parser')
    sleep(120)
    article = []
    article_title = soup.title.text
    article.append(article_title)
    #print(soup.prettify())
    article_content = []
    content = soup.find("div", {"class": "col-7 article-main-body row"}).findAll('p')
    # Writing the content found in the list in its text form
    for item in content:
        article_content.append(item.text)
    # Joining the list elements to form a proper paragraph
    article_content = " ".join(article_content)
    article.append(article_content)
    all_articles.append(article)

# In[129]:

import pandas as pd

df = pd.DataFrame()
df = df.append(all_articles)
df.to_csv('cnet.csv', encoding='utf-8')
And this is my error:
Traceback (most recent call last):
  File "Gopika_CNET.py", line 113, in <module>
    df = df.append(all_articles)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py", line 4634, in append
    elif isinstance(other, list) and not isinstance(other[0], DataFrame):
IndexError: list index out of range

Seems like a bug in pandas: if you try to append an empty list to a DataFrame, you get exactly this error. The line below raises the same error, so the traceback itself is not pointing at a bug in your code:

pandas.DataFrame().append([])

Make sure that your all_articles is not an empty list and you will not get this error.
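A minimal guard, as a sketch: check that the list is non-empty before building the DataFrame, and construct it directly with named columns instead of calling append (the column names here are assumptions, not from the original code):

import pandas as pd

# all_articles is assumed to be the list of [title, content] pairs built above
if all_articles:
    df = pd.DataFrame(all_articles, columns=['title', 'content'])  # hypothetical column names
    df.to_csv('cnet.csv', encoding='utf-8', index=False)
else:
    print("No articles were scraped - check that link_list was populated.")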

Related

web-scrape: get H4 attributes & href

I am trying to web-scrape a website, but I can't get access to the attributes of some fields.
Here is the code I used:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd

scrap_list = pd.DataFrame()
for path in range(10):  # scroll over the categories
    for path in range(10):  # scroll over the pages
        url = 'https://www.samehgroup.com/index.php?route=product/category'+str(page)+'&'+'path='+ str(path)
        req = urllib3.PoolManager()
        res = req.request('GET', URL)
        soup = BeautifulSoup(res.data, 'html.parser')
        soup.findAll('h4', {'class': 'caption'})
        # extract names
        scrap_name = [i.text.strip() for i in soup.findAll('h2', {'class': 'caption'})]
        scrap_list['product_name'] = pd.DataFrame(scrap_name, columns=['Item_name'])
        # extract prices
        scrap_list['product_price'] = [i.text.strip() for i in soup.findAll('div', {'class': 'price'})]
        product_price = pd.DataFrame(scrap_price, columns=['Item_price'])
I want an output that provides me with each product and its price. I still can't get that right.
Any help would be very much appreciated.
I think the problem here was looping through the website's pages. I got the code below working by first making a list of urls containing numbered 'paths' corresponding to pages on the website, then looping through this list and applying a page number to the url.
If you only want the products from a certain page, that page can be selected from urlist by index.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

urlist = []  # create list of usable urls to iterate through
for i in range(1, 10):  # 9 pages, equal to pages on the website
    urlist.append('https://www.samehgroup.com/index.php?route=product/category&path=' + str(i))

namelist = []
newprice = []

for urlunf in urlist:  # first loop to get 'path'
    for n in range(100):  # second loop to get 'pages'; set at 100 to cover the website max page at 93
        try:  # try catches when pages containing products run out
            url = urlunf + '&page=' + str(n)
            page = requests.get(url).text
            soup = BeautifulSoup(page, 'html')
            products = soup.find_all('div', class_='caption')
            for prod in products:  # loop over returned list of products for names and prices
                name = prod.find('h4').text
                newp = prod.find('p', class_='price').find('span', class_='price-new').text
                namelist.append(name)  # append data to lists outside of the loop
                newprice.append(newp)
            time.sleep(2)
        except AttributeError:  # if there are no more products it will move to the next page
            pass

df = pd.DataFrame()  # create df and add scraped data
df['name'] = namelist
df['price'] = newprice
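To get the per-product output the question asks for, the resulting DataFrame can be inspected or written out; a short usage sketch (the output file name is an assumption):

print(df.head())  # each row pairs a product name with its price
df.to_csv('samehgroup_products.csv', index=False)  # hypothetical output file name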

Python issue for crawling multiple page title

I am a marketer and want to conduct some basic market research using Python.
I wrote a simple script to crawl the titles of multiple pages, but I can't get the title text into a list and transfer it into Excel format. How can I do that in this case?
I tried to create a list and use the extend() method to put the looped titles into the list, but it did not work:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def content_get(url):
    count = 0
    while count < 4:  # this case was to crawl the titles of 4 pages
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        titles = soup.find(id="main-container").find_all("div", class_="r-ent")
        for title in titles:
            print([title.find('div', class_='title').text])
        nextpageurl = soup.find("a", string="‹ 上頁")["href"]
        url = "https://www.ptt.cc" + nextpageurl
        count += 1

firstpage = "https://www.ptt.cc/bbs/movie/index9002.html"
content_get(firstpage)
You need to add the titles to a list outside of the while loop:
def content_get(url):
    count = 0
    titles = []
    while count < 4:
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        title_page = [title.text.replace('\n', '') for title in soup.find_all('div', {'class': 'title'})]
        titles.extend(title_page)
        nextpageurl = soup.find("a", string="‹ 上頁")["href"]
        url = "https://www.ptt.cc" + nextpageurl
        count += 1
    return titles
If you don't want the list comprehension that builds title_page, it can be replaced with a traditional for loop (using a separate variable name so it doesn't shadow the outer titles list):

title_page = []
title_divs = soup.find_all('div', {'class': 'title'})
for title in title_divs:
    title_page.append(title.text.replace('\n', ''))
For the Excel file:

def to_excel(text):
    df = pd.DataFrame(text, columns=['Title'])
    return df.to_excel('output.xlsx')
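Putting the two together, a short usage sketch (the starting URL comes from the question):

firstpage = "https://www.ptt.cc/bbs/movie/index9002.html"
titles = content_get(firstpage)  # returns the accumulated list of titles
to_excel(titles)                 # writes them to output.xlsx with a 'Title' column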

How to scrape the dynamic table data

I want to scrape the table data from http://5000best.com/websites/
The content of the table is paginated across several pages and is dynamic.
I want to scrape the table data for each category. I can scrape the table manually for each category, but this is not what I want.
Please look at it and suggest an approach.
I am able to make links for each category, i.e. http://5000best.com/websites/Movies/, http://5000best.com/websites/Games/, etc.
But I am not sure how to go further and navigate through the paginated table for each category.
And after making all the links, I need to extract the table data using those links.
Edit: I am using requests, BeautifulSoup4.
Simple Scrapy spider:

import scrapy

class Best500Spider(scrapy.Spider):
    name = "best5000"
    start_urls = ['http://5000best.com/websites/1']

    def parse(self, response):
        for row in response.xpath('//table[@id="ttable"]//tr'):
            record = {}
            record["Rank"] = row.xpath('./td[1]/text()').get()
            record["Score"] = row.xpath('./td[2]/text()').get()
            record["Category"] = row.xpath('string(./td[3])').get()
            record["URL"] = row.xpath('string(./td[5])').get()
            yield record

        next_page_url = response.xpath('//div[@id="dpages"]/span[@class="pagen0"]/following-sibling::span[1]/a/@href').get()
        if next_page_url:
            yield scrapy.Request(
                url=response.urljoin(next_page_url),
                callback=self.parse
            )
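As a usage note (a sketch; the file names are assumptions): a standalone spider like this can be run without creating a full Scrapy project:

# save the spider as best5000_spider.py, then from a shell:
#   scrapy runspider best5000_spider.py -o best5000.csv
# runspider executes a single spider file, and -o writes the yielded records to a feed file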
I looked at the site; to move to another page, just add /pageNumber at the end of the link.
For example,
http://5000best.com/websites/50 will get you page 50.
You can use this tool to get Python requests code for one page and then add a loop: https://curl.trillworks.com/
Just put in "curl http://5000best.com/websites/50" and adapt the generated code from there.
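A minimal sketch of that page-number loop with requests and BeautifulSoup (the page range and the stopping condition are assumptions; the table id 'ttable' comes from the other answers here):

import requests
from bs4 import BeautifulSoup

rows = []
for page in range(1, 51):  # assumed range; adjust to the number of pages you need
    resp = requests.get('http://5000best.com/websites/' + str(page))
    soup = BeautifulSoup(resp.text, 'html.parser')
    table = soup.find('table', {'id': 'ttable'})
    if table is None:  # stop when a page no longer contains the table
        break
    for tr in table.find_all('tr'):
        rows.append([td.get_text(strip=True) for td in tr.find_all('td')])

print(len(rows), 'rows scraped')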
I came up with this approach to scrape the tables for each category.
# ------------Hemant Sah--------------------
# <- --------Importing Libraries-------- ->
import requests
from bs4 import BeautifulSoup
import pandas as pd
import math
import itertools
import sqlalchemy
import re

final_list = []
dataframe = pd.DataFrame([])

def make_soup(url):
    try:
        html = requests.get(url)
    except requests.exceptions.HTTPError as e:
        print(e)
    else:
        soup = BeautifulSoup(html.text, 'lxml')
        # print(html.status_code)
        return soup

def get_categories_from_soup(soup):
    total_list = []
    for item in soup.find_all('div', {"class": "sca2"}):
        total_list.append(item.text)
    for item in soup.find_all('div', {"class": "sca2_a"}):
        total_list.append(item.text)
    for item in soup.find_all('div', {"class": "sca2_b"}):
        total_list.append(item.text)
    for item in soup.find_all('div', {"class": "sca2_c"}):
        total_list.append(item.text)
    total_list.remove("All (5000)")
    total_list.remove("Porn (201)")
    return total_list

def make_url(total_list, url):
    path, page_num, test_page_num, modified_links, new_links = [], [], [], [], []
    for category in total_list:
        reg_exp_path = re.compile(r'^\w+')
        path.extend(reg_exp_path.findall(category))
        test_page_num.extend(re.findall('[0-9]+', category))
        # print(path)
    for c in test_page_num:
        temp = math.ceil(int(c) / 100)
        page_num.append(temp)
        # print(page_num)
    # print(page_num)
    for p in path:
        links = (url + p + "/")
        modified_links.append(links)
        # print(modified_links)
    for w, p in zip(modified_links, page_num):
        for n in range(1, p + 1):
            temp = w + str(n)
            new_links.append(temp)
    print(new_links)
    return new_links

def fetch_table_data(links):
    for l in links:
        soup = make_soup(l)
        my_table = soup.find('table', {'id': 'ttable'})
        rows = my_table.find_all('tr')
        for tr in rows:
            td = tr.find_all('td')
            row = [tr.text for tr in td]
            final_list.append(row)
    df = pd.DataFrame(final_list, columns=["Rank", "Score", "Category", "Audience", "URL", "Links", "blank", "Desc"])
    print(df)
    df = df.drop("blank", axis=1)
    # print(df)
    return df
    # df.to_csv('final_data.csv')

def main():
    url = "http://5000best.com/websites/"
    soup = make_soup(url)
    total_list = get_categories_from_soup(soup)
    links = make_url(total_list, url)
    dataframe = fetch_table_data(links)

if __name__ == "__main__":
    main()
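One note: main() assigns the result of fetch_table_data() to a local variable, so the module-level dataframe is never updated. If you want to keep or save the result, a small variation of main() (the CSV file name is taken from the commented-out line above):

def main():
    url = "http://5000best.com/websites/"
    soup = make_soup(url)
    total_list = get_categories_from_soup(soup)
    links = make_url(total_list, url)
    df = fetch_table_data(links)
    df.to_csv('final_data.csv', index=False)  # persist the scraped table
    return df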

Getting web links to all items in a table and then doing pagination

I am able to get all the links on a particular web page but am having trouble with the pagination.
I am doing the following:
import requests, bs4, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

r = requests.get(start_url)
soup = BeautifulSoup(r.text, 'html.parser')
a_tags = soup.find_all('a')
print(a_tags)
links = [urljoin(start_url, a['href']) for a in a_tags]
print(links)
As a toy example, I am using the following website:
start_url = 'https://www.opencodez.com/page/1'
I am able to get all the links this way. However, I am trying to automate it more by going to the next page and doing the same thing, and outputting all the links to a csv file.
I tried the following but got no output:
start_url = 'https://www.opencodez.com/'

with open('names.csv', mode='w') as csv_file:
    fieldnames = ['Name']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

article_link = []

def scraping(webpage, page_number):
    next_page = webpage + str(page_number)
    r = requests.get(str(next_page))
    soup = BeautifulSoup(r.text, 'html.parser')
    a_tags = soup.find_all('a')
    print(a_tags)
    links = [urljoin(start_url, a['href']) for a in a_tags]
    print(links)
    for x in range(len(soup)):
        article_link.append(links)
    if page_number < 16:
        page_number = page_number + 1
        scraping(webpage, page_number)

scraping('https://www.opencodez.com/page/', 1)

# creating the data frame and populating its data into the csv file
data = {'Name': article_link}
df = DataFrame(data, columns=['Article_Link'])
df.to_csv(r'C:\Users\xxxxx\names.csv')
Could you please help me determine where I am going wrong?
I do not mind getting the links either in the output console or written to a csv file.
There were issues here and there with your code but this worked for me:
import requests, bs4, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

start_url = 'https://www.opencodez.com/'
r = requests.get(start_url)  # first page scraping
soup = BeautifulSoup(r.text, 'html.parser')
a_tags = soup.find_all('a')
article_link = []
links = [urljoin(start_url, a['href']) for a in a_tags]
article_link.append(links)

for page in range(2, 19):  # for every page after 1
    links = []  # resetting lists on every page just in case
    a_tags = []
    url = 'https://www.opencodez.com/page/' + str(page)
    r = requests.get(url)  # fetch the numbered page, not start_url again
    soup = BeautifulSoup(r.text, 'html.parser')
    a_tags = soup.find_all('a')
    links = [urljoin(start_url, a['href']) for a in a_tags]
    article_link.append(links)

print(article_link)
I basically just changed how you append to the list article_link. This variable at the moment is a list of length 18. Each list within article_link is a list of 136 links.
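To get a flat csv of the links, as the question asks, the nested lists can be flattened and written out with pandas; a short sketch (names.csv is the file name used in the question):

import pandas as pd

flat_links = [link for page_links in article_link for link in page_links]  # flatten the list of lists
df = pd.DataFrame({'Article_Link': flat_links})
df.to_csv('names.csv', index=False)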

requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied

I am working on a web scraping project and have run into the following error.
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Below is my code. I retrieve all of the links from the HTML table and they print out as expected, but when I try to loop through them (links) with requests.get I get the error above.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')

table = soup.find('table')
for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print(links)

for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = []
    # Find all the divs we need in one go.
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # find all the enclosing a tags.
        anchors = div.find_all('a')
        for anchor in anchors:
            # Now we have groups of 3 list items (li) tags
            lis = anchor.find_all('li')
            # we clean up the text from the group of 3 li tags and add them as a list to our table list.
            table.append([unicodedata.normalize("NFKD", lis[0].text).strip(), lis[1].text, lis[2].text.strip()])

    # We have all the data so we add it to a DataFrame.
    headers = ['Number', 'Tenant', 'Square Footage']
    df = DataFrame(table, columns=headers)
    print(df)
Your mistake is the second for loop in the code:

for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print(links)

for link in links:

ref['href'] gives you a single url, but you use it as a list in the next for loop.
So you effectively have

for link in ref['href']:

and it gives you the first char of the url http://properties.kimcore..., which is h.
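A tiny illustration of why that happens (iterating over a string yields its characters; the url here is hypothetical):

links = "http://properties.kimcorealty.com/property/1234"  # a single url string, not a list
for link in links:
    print(link)  # prints 'h', 't', 't', 'p', ... one character per iteration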
Full working code:

from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')

table = soup.find('table')
for ref in table.find_all('a', href=True):
    link = ref['href']
    print(link)

    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = []
    # Find all the divs we need in one go.
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # find all the enclosing a tags.
        anchors = div.find_all('a')
        for anchor in anchors:
            # Now we have groups of 3 list items (li) tags
            lis = anchor.find_all('li')
            # we clean up the text from the group of 3 li tags and add them as a list to our table list.
            table.append([unicodedata.normalize("NFKD", lis[0].text).strip(), lis[1].text, lis[2].text.strip()])

    # We have all the data so we add it to a DataFrame.
    headers = ['Number', 'Tenant', 'Square Footage']
    df = DataFrame(table, columns=headers)
    print(df)
BTW: if you use a comma, as in (ref['href'], ), then you get a tuple and the second for loop works correctly.
EDIT: The updated code below creates the list table_data at the start, adds all the data into this list, and converts it into a DataFrame at the end.
But now I see it reads the same page a few times, because the same url appears in every column of a row. You would have to get the url from only one column.
EDIT: now it doesn't read the same url many times.
EDIT: now it gets the text and href from the first link in each row and adds them to every element appended to the list.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')

table_data = []

# all rows in table except the first ([1:]) - headers
rows = soup.select('table tr')[1:]
for row in rows:
    # link in first column (td[0])
    #link = row.select('td')[0].find('a')
    link = row.find('a')

    link_href = link['href']
    link_text = link.text

    print('text:', link_text)
    print('href:', link_href)

    page = requests.get(link_href)
    soup = BeautifulSoup(page.content, 'html.parser')

    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        anchors = div.find_all('a')
        for anchor in anchors:
            lis = anchor.find_all('li')
            item1 = unicodedata.normalize("NFKD", lis[0].text).strip()
            item2 = lis[1].text
            item3 = lis[2].text.strip()
            table_data.append([item1, item2, item3, link_text, link_href])

    print('table_data size:', len(table_data))

headers = ['Number', 'Tenant', 'Square Footage', 'Link Text', 'Link Href']
df = DataFrame(table_data, columns=headers)
print(df)
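If you want to keep the result instead of just printing it, the DataFrame can be written to a file; a short sketch (the file name is an assumption):

df.to_csv('kimco_units.csv', index=False)  # hypothetical output file name
print('saved', len(df), 'rows')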
