I've scraped the website for my research, but I couldn't find the right way to extract the data into a data frame. I believe my problem is related to the list objects between lines 36 and 38.
The print line works nicely, so I can see the final version of the data frame in the Python console.
The solution is probably simple, but I couldn't figure it out. Thanks in advance for any help.
from time import sleep
from bs4 import BeautifulSoup, SoupStrainer
import requests
import pandas as pd

# Insert the highest page number for the website
highest_number = 12


def total_page_number(url):
    all_webpage_links = []
    all_webpage_links.insert(0, url)
    pages = [str(each_number) for each_number in range(2, highest_number)]
    for page in pages:
        link = ''.join(url + '&page=' + page)
        all_webpage_links.append(link)
    return all_webpage_links


# Use total_page_number function to create the page list for the website
All_page = total_page_number(
    'https://www.imdb.com/search/title?countries=tr&languages=tr&locations=Turkey&count=250&view=simple')


def clean_text(text):
    """ Removes white-spaces before, after, and between characters

    :param text: the string to clean
    :return: a "cleaned" string with no more than one white space between
    characters
    """
    return ' '.join(text.split())


# Create list objects for the data
# Problem occurs in this line !!!!!!
actor_names = []
titles = []
dates = []


def get_cast_from_link(movie_link):
    """ Go to the IMDb movie page in link, and find the cast overview list.

    Prints tab-separated movie_title, actor_name, and character_played to
    stdout as a result. Nothing returned.

    :param movie_link: string of the link to the IMDb movie page (http://imdb.com
    ...)
    :return: void
    """
    movie_page = requests.get(movie_link)
    # Use SoupStrainer to strain the cast_list table from the movie_page.
    # This can save some time in bigger scraping projects.
    cast_strainer = SoupStrainer('table', class_='cast_list')
    movie_soup = BeautifulSoup(movie_page.content, 'html.parser', parse_only=cast_strainer)
    # Iterate through rows and extract the name and character.
    # Remember that some rows might not be a row of interest (e.g., a blank
    # row for spacing the layout). Therefore, we need to use a try-except
    # block to make sure we capture only the rows we want, without Python
    # complaining.
    for row in movie_soup.find_all('tr'):
        try:
            actor = clean_text(row.find(itemprop='name').text)
            actor_names.append(actor)
            titles.append(movie_title)
            dates.append(movie_date)
            print('\t'.join([movie_title, actor, movie_date]))
        except AttributeError:
            pass


# Export the data frame
# Problem occurs in this line !!!!!!
tsd_df = pd.DataFrame({'Actor_Names': actor_names,
                       'Movie_Title': titles,
                       'Movie_Date': dates})
tsd_df.to_csv('/Users/ea/Desktop/movie_df.tsv', encoding='utf-8')

for each in All_page:
    # Use requests.get('url') to load the page you want
    web_page = requests.get(each)
    # https://www.imdb.com/search/title?countries=tr&languages=tr&count=250&view=simple&page=2
    # Prepare the SoupStrainer to strain just the tbody containing the list of movies
    list_strainer = SoupStrainer('div', class_='lister-list')
    # Parse the html content of the web page with BeautifulSoup
    soup = BeautifulSoup(web_page.content, 'html.parser', parse_only=list_strainer)
    # Generate a list of the "Rank & Title" column of each row and iterate
    movie_list = soup.find_all('span', class_='lister-item-header')
    for movie in movie_list:
        movie_title = movie.a.text
        movie_date = movie.find('span', class_='lister-item-year text-muted unbold').text
        # Get the link to the movie's own IMDb page, and jump over
        link = 'http://imdb.com' + movie.a.get('href')
        get_cast_from_link(link)
        # Remember to be nice, and sleep a while between requests!
        sleep(15)
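For what it's worth, a minimal sketch of one likely fix (my assumption, not the asker's confirmed solution): in the order posted above, the DataFrame and the .tsv file are created before the page loop runs, while actor_names, titles and dates are still empty, so moving the export below the loop should capture the scraped rows:

# Sketch: build and export the data frame only after the page loop has
# finished filling actor_names, titles and dates.
tsd_df = pd.DataFrame({'Actor_Names': actor_names,
                       'Movie_Title': titles,
                       'Movie_Date': dates})
tsd_df.to_csv('/Users/ea/Desktop/movie_df.tsv', encoding='utf-8')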
Related
I am working on a program that crawls Internet articles. The program starts by entering the start and end pages of the website.
The program works in the following order:
Web-crawl the article information (title, sort, time, contents).
Remove special characters.
Extract only the nouns.
The problem seems to lie in extracting the nouns while cleaning the article content; everything works up to the stage before noun extraction.
The error message is as follows:
ValueError: Length of passed values is 4, index implies 5
To solve this problem, I tried adding rows with DataFrame append, but it doesn't solve the problem.
I use the konlpy package (a Korean morpheme analyzer).
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
from konlpy.tag import Okt
from pandas import Series

i = input('Start page? : ')
k = input('End page? : ')
startpage = int(i)
lastpage = int(k)
count = int(i)

# Definition of the text cleaning function
def text_cleaning(text):
    hangul = re.compile('[^ㄱ-ㅣ가-힣]+')
    result = hangul.sub(' ', text)
    return result

# Definition of the noun extraction function
def get_nouns(x):
    nouns_tagger = Okt()
    nouns = nouns_tagger.nouns(x)
    nouns = [noun for noun in nouns if len(noun) > 1]
    nouns = [noun for noun in nouns if noun not in stopwords]
    return nouns

# Data frame formation
columns = ['Title', 'Sort', 'Datetime', 'Article']
news_info = pd.DataFrame(columns=columns)
idx = 0

# Web-site page loop
while startpage < lastpage + 1:
    url = f'http://www.koscaj.com/news/articleList.html?page={startpage}&total=72698&box_idxno=&sc_section_code=S1N2&view_type=sm'
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all(class_='list-titles')
    print(f'-----{count}page result-----')

    # Articles loop in the web-site page
    for link in links:
        news_url = "http://www.koscaj.com" + link.find('a')['href']
        news_link = urllib.request.urlopen(news_url).read()
        soup2 = BeautifulSoup(news_link, 'html.parser')

        # An article's title
        title = soup2.find('div', {'class': 'article-head-title'})
        if title:
            title = soup2.find('div', {'class': 'article-head-title'}).text
        else:
            title = ''

        # An article's sort
        sorts = soup2.find('nav', {'class': 'article-head-nav auto-marbtm-10'})
        try:
            sorts2 = sorts.find_all('a')
            sort = sorts2[2].text
        except:
            sort = ''

        # An article's time
        date = soup2.find('div', {'class': 'info-text'})
        try:
            datetime = date.find('i', {'class': 'fa fa-clock-o fa-fw'}).parent.text.strip()
            datetime = datetime.replace("승인", "")
        except:
            datetime = ''

        # An article's content
        article = soup2.find('div', {'id': 'article-view-content-div'})
        if article:
            article = soup2.find('div', {'id': 'article-view-content-div'}).text
            article = article.replace("\n", "")
            article = article.replace("\r", "")
            article = article.replace("\t", "")
            article = article.replace("[전문건설신문] koscaj#kosca.or.kr", "")
            article = article.replace("저작권자 © 대한전문건설신문 무단전재 및 재배포 금지", "")
            article = article.replace("전문건설신문", "")
            article = article.replace("다른기사 보기", "")
        else:
            article = ''

        # Remove special characters
        news_info['Title'] = news_info['Title'].apply(lambda x: text_cleaning(x))
        news_info['Sort'] = news_info['Sort'].apply(lambda x: text_cleaning(x))
        news_info['Article'] = news_info['Article'].apply(lambda x: text_cleaning(x))
Up to this point, the program works without any problems. But as the error message shows, the operation fails because the length of the passed values and the DataFrame's index differ.
Text data cleaning for noun extraction:
        # Data frame for storing individual articles after crawling
        row = [title, sort, datetime, article]
        series = pd.Series(row, index=news_info.columns)
        news_info = news_info.append(series, ignore_index=True)

        # Load the Korean stopword dictionary file
        path = "C:/Users/이바울/Desktop/이바울/코딩파일/stopwords-ko.txt"
        with open(path, encoding='utf-8') as f:
            stopwords = f.readlines()
        stopwords = [x.strip() for x in stopwords]

        news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))

    startpage += 1
    count += 1

news_info.to_excel(f'processing{lastpage-int(1)}-{startpage-int(1)}.xlsx')
print('Complete')
After setting up the existing 4 columns in the pandas DataFrame, I used append, and the column of extracted nouns is added as a 5th column. I know this method adds the row regardless of the index name. As the image link at the bottom shows, the first article is crawled and its results appear; from the next article on, it stops working and the error occurs.
(Image: program error result)
(Link: Korean stopwords dictionary)
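A minimal sketch that reproduces the error outside the crawler (hypothetical values, using DataFrame.append as in the question): once the 'Nouns' column exists, news_info.columns has 5 entries, so building the next 4-value Series against it fails.

import pandas as pd

# Hypothetical reproduction, not the crawler's data
news_info = pd.DataFrame(columns=['Title', 'Sort', 'Datetime', 'Article'])
first_row = pd.Series(['t', 's', 'd', 'a'], index=news_info.columns)   # 4 values, 4 columns: fine
news_info = news_info.append(first_row, ignore_index=True)             # append as used in the question
news_info['Nouns'] = [['noun']]                                        # now there are 5 columns
second_row = pd.Series(['t', 's', 'd', 'a'], index=news_info.columns)
# Raises the question's ValueError (message wording varies by pandas version):
# "Length of passed values is 4, index implies 5"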
I solved the problem.
It came down to where the code sits relative to the for loops.
I was able to fix it by repeatedly repositioning the problematic lines while leaving the code that already worked as it was.
I solved it by dedenting only the line below by two levels (moving it out of both loops):
news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))
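My reading of that fix, as a sketch (assuming the loop structure from the question): the noun extraction runs once, after both loops have finished appending rows, so the Series built inside the loop always has 4 values against 4 columns.

# Sketch of the corrected placement (names taken from the question's code)
while startpage < lastpage + 1:
    # ... fetch the listing page and collect its article links ...
    for link in links:
        # ... build `series` for one article (4 values, 4 columns) ...
        news_info = news_info.append(series, ignore_index=True)
    startpage += 1
    count += 1

# Dedented twice: the 5th column is added only once, after all rows exist
news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))
news_info.to_excel(f'processing{lastpage-int(1)}-{startpage-int(1)}.xlsx')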
This is my first time trying to use Python with Selenium and bs4.
I'm trying to scrape data from this website.
To begin, I select GE from the cantone dropdown menu, tick the "Conffermo" checkbox and click the "Ricerca" button. Then I can see the data. I have to click each arrow to expand the data and scrape it for every person (this is a loop, isn't it?), and then do the same on the next page (by clicking "Affiggere le seguenti entrate" at the bottom of the page).
I'd like to use relative XPaths for the data, since not all persons have all the fields (I'd like to put an empty cell in Excel when data is missing).
This is my code so far:
import time
import urllib2
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

browser = webdriver.Firefox()
URL = 'http://www.asca.ch/Partners.aspx?lang=it'
browser.get(URL)
time.sleep(10)

# query the website and return the html to the variable 'page'
page = urllib2.urlopen(URL)
soup = BeautifulSoup(page, 'html.parser')

inputElementCantone = browser.find_element_by_xpath('//*[@id="ctl00_MainContent_ddl_cantons_Input"]').click()
browser.find_element_by_xpath('/html/body/form/div[1]/div/div/ul/li[9]').click()
browser.find_element_by_xpath('//INPUT[@id="MainContent__chkDisclaimer"]').click()
browser.find_element_by_xpath('//INPUT[@id="MainContent_btn_submit"]').click()
arrow = browser.find_element_by_class_name("footable-toggle")
I'm stuck after this. The data I'd like to scrape (into Excel columns) are: Discipline(s) thérapeutique(s), Cognome, Cellulare and email.
Any help is appreciated.
# Find the table
table = soup.find('table', {'class': 'footable'})
# Get all rows in that table
rows = table.find_all('tr')

# A function to process each row
def processRow(row):
    # All cells with hidden data
    dataFields = row.find_all('td', {'style': True})
    output = {}
    # Fixed index numbers are not ideal, but in this case they will work
    output['Discipline'] = dataFields[0].text
    output['Cognome'] = dataFields[2].text
    output['Cellulare'] = dataFields[8].text
    output['email'] = dataFields[10].text
    return output

# Declare a list to store all results
results = []
# Iterate over all the rows and store each processed result in the list
for row in rows:
    results.append(processRow(row))

print(results)
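One gap worth noting: in the question the page is driven with Selenium, so the soup used above has to be built from the rendered page rather than from urllib2. A minimal sketch of that hand-off (reusing the browser object from the question):

from bs4 import BeautifulSoup

# After the "Ricerca" click has populated the results table,
# hand the rendered HTML over to BeautifulSoup.
soup = BeautifulSoup(browser.page_source, 'html.parser')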
I am new to Python and I am trying to turn scraped data into a CSV file, but without success.
Here is the code:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import random
import re
from itertools import cycle


def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')  # cleaning the strings from these terms
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


def scrape(url, filename, number_id):
    """
    This function scrapes a web page looking for text inside its html structure and saves it in a .txt file.
    So it works only for static content; if you need text in a dynamic part of the web page (e.g. a banner)
    look at the other file. Pay attention that the retrieved text must be filtered
    in order to keep only the part you need.

    url: url to scrape
    filename: name of the file where the text is stored
    number_id: it is appended to the filename, to distinguish different filenames
    """
    # here there is a list of possible user agents
    user_agent = random.choice(user_agent_list)
    req = Request(url, headers={'User-Agent': user_agent})
    page = urlopen(req).read()

    # parse the html using beautiful soup and store in variable 'soup'
    soup = BeautifulSoup(page, "html.parser")
    row = soup.find_all(class_="row")
    for element in row:
        viaggio = element.find_all(class_="nowrap")
        Partenza = viaggio[0]
        Ritorno = viaggio[1]
        Viaggiatori = viaggio[2]
        Costo = viaggio[3]
        Title = element.find(class_="taglist bold")
        Content = element.find("p")
        Destination = Title.text
        Review = Content.text
        Departure = Partenza.text
        Arrival = Ritorno.text
        Travellers = Viaggiatori.text
        Cost = Costo.text
        TuristiPerCasoList = [Destination, Review, Departure, Arrival, Travellers, Cost]
        print(TuristiPerCasoList)
Up to here, everything works. Now I have to turn the data into a CSV file.
I tried with this:
import csv

with open('turistipercaso', 'w') as file:
    writer = csv.writer(file)
    writer.writerows(TuristiPerCasoList)
but nothing useful ends up in the CSV file.
Can someone help me understand what to do to turn this into a CSV file?
In each iteration, you are reassigning the TuristiPerCasoList value.
What you actually want is a list of lists of strings, where each string is the value of a specific cell, each inner list contains the values of one row, and the outer list contains all the rows.
To achieve this, you should append a list representing a row to the main list:
# instead of
TuristiPerCasoList = [Destination, Review, Departure, Arrival, Travellers, Cost]
# use
TuristiPerCasoList.append([Destination, Review, Departure, Arrival, Travellers, Cost])
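Putting that together, a minimal sketch of the surrounding pieces (the placement, the .csv extension and newline='' are my assumptions; the names come from the question):

import csv

TuristiPerCasoList = []   # initialize once, before the scraping loop

for element in row:       # the loop inside scrape() from the question
    # ... extract Destination, Review, Departure, Arrival, Travellers, Cost as above ...
    TuristiPerCasoList.append([Destination, Review, Departure, Arrival, Travellers, Cost])

# after the loop, write one CSV row per inner list
with open('turistipercaso.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(TuristiPerCasoList)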
This script generates a CSV with the data from only one of the URLs fed into it. There should be 98 sets of results; however, the for loop isn't getting past the first URL.
I've been working on this for 12+ hours today. What am I missing in order to get the correct results?
import requests
import re
from bs4 import BeautifulSoup
import csv

# Read csv
csvfile = open("gyms4.csv")
csvfilelist = csvfile.read()

def get_page_data():
    for page in csvfilelist.splitlines():
        r = requests.get(page.strip())
        soup = BeautifulSoup(r.text, 'html.parser')
        return soup
        print r.text

pages = get_page_data()

name = pages.find("span", {"class": "wlt_shortcode_TITLE"}).text
address = pages.find("span", {"class": "wlt_shortcode_map_location"}).text
phoneNum = pages.find("span", {"class": "wlt_shortcode_phoneNum"}).text
email = pages.find("span", {"class": "wlt_shortcode_EMAIL"}).text

th = pages.find('b', text="Category")
td = th.findNext()
for link in td.findAll('a', href=True):
    match = re.search(r'http://(\w+).(\w+).(\w+)', link.text)
    if match:
        web_address = link.text

gyms = [name, address, phoneNum, email, web_address]
gyms.append(gyms)

# Saving specific listing data to csv
with open("xgyms.csv", "w") as file:
    writer = csv.writer(file)
    for row in gyms:
        writer.writerow([row])
You have 3 for-loops in your code and do not specify which one causes the problem. I assume it is the one in the get_page_data() function.
You leave the loop on the very first run because of the return statement. That is why you never get to the second URL.
There are at least two possible solutions:
Append the parsed soup of every URL to a list and return that list (see the sketch after this list).
Move your processing code into the loop and append the parsed data to gyms inside the loop.
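A minimal sketch of the first option, keeping the question's global csvfilelist (the list-returning shape is my reading of the suggestion):

def get_page_data():
    # collect every parsed page instead of returning on the first one
    soups = []
    for page in csvfilelist.splitlines():
        r = requests.get(page.strip())
        soups.append(BeautifulSoup(r.text, 'html.parser'))
    return soups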
As Alex.S said, get_page_data() returns on the first iteration, hence subsequent URLs are never accessed. Furthermore, the code that extracts data from the page needs to be executed for each page downloaded, so it needs to be in a loop too. You could turn get_page_data() into a generator and then iterate over the pages like this:
def get_page_data(urls):
    for url in urls:
        r = requests.get(url.strip())
        soup = BeautifulSoup(r.text, 'html.parser')
        yield soup  # N.B. use yield instead of return

with open("gyms4.csv") as url_file:
    for page in get_page_data(url_file):
        name = page.find("span", {"class": "wlt_shortcode_TITLE"}).text
        address = page.find("span", {"class": "wlt_shortcode_map_location"}).text
        phoneNum = page.find("span", {"class": "wlt_shortcode_phoneNum"}).text
        email = page.find("span", {"class": "wlt_shortcode_EMAIL"}).text
        # etc. etc.
You can write the data to the CSV file as each page is downloaded and processed, or you can accumulate the data into a list and write it in one go with csv.writer.writerows() (a sketch of the latter follows below).
Also, you should pass the URL list to get_page_data() rather than accessing it from a global variable.
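A minimal sketch of the accumulate-then-write variant (all_gyms is a hypothetical name; the extraction is abbreviated to two fields, and csv plus get_page_data() are assumed from the code above):

all_gyms = []                                   # one inner list per page
with open("gyms4.csv") as url_file:
    for page in get_page_data(url_file):
        name = page.find("span", {"class": "wlt_shortcode_TITLE"}).text
        address = page.find("span", {"class": "wlt_shortcode_map_location"}).text
        all_gyms.append([name, address])        # extend with the other fields as needed

with open("xgyms.csv", "w") as out_file:
    csv.writer(out_file).writerows(all_gyms)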
This may end up being a really novice question, because I'm a novice, but here goes.
I have a set of .html pages obtained using wget. I want to iterate through them and extract certain info, putting it in a .csv file.
Using the code below, all the names print when my program runs, but only the info from the next-to-last page (i.e., 29.html here) prints to the .csv file. I'm trying this with only a handful of files at first; there are about 1,200 that I'd like to get into this format.
The files are based on those here: https://www.cfis.state.nm.us/media/ReportLobbyist.aspx?id=25&el=2014 where the page numbers are the id.
Thanks for any help!
from bs4 import BeautifulSoup
import urllib2
import csv

for i in xrange(22, 30):
    try:
        page = urllib2.urlopen('file:{}.html'.format(i))
    except:
        continue
    else:
        soup = BeautifulSoup(page.read())
        n = soup.find(id='ctl00_ContentPlaceHolder1_lnkBCLobbyist')
        name = n.string
        print name

        table = soup.find('table', 'reportTbl')

        # get the rows
        list_of_rows = []
        for row in table.findAll('tr')[1:]:
            col = row.findAll('td')
            filing = col[0].string
            status = col[1].string
            cont = col[2].string
            exp = col[3].string
            record = (name, filing, status, cont, exp)
            list_of_rows.append(record)

        # write to file
        writer = csv.writer(open('lob.csv', 'wb'))
        writer.writerows(list_of_rows)
You need to append each time, not overwrite. Use mode 'a'; open('lob.csv', 'wb') overwrites the file each time through your outer loop:
writer = csv.writer(open('lob.csv', 'ab'))
writer.writerows(list_of_rows)
You could also declare list_of_rows = [] outside the for loops and write to the file once at the very end, as sketched below.
If you also want page 30, you need to loop over range(22, 31).
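A minimal sketch of that alternative, keeping the question's Python 2 style (the body of the loop is abbreviated):

list_of_rows = []                       # declared once, outside the page loop

for i in xrange(22, 31):                # 31 so that 30.html is included
    # ... open the page and build each `record` tuple as in the question ...
    list_of_rows.append(record)

# write everything to the file once, at the very end
writer = csv.writer(open('lob.csv', 'wb'))
writer.writerows(list_of_rows)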