My scraping code skips new lines - Scrapy - python

I have this code to scrape review text from IMDB. I want to retrieve the entire text of the review, but it cuts off whenever there is a new line. For example, for this review:
Saw an early screening tonight in Denver.
I don't know where to begin. So I will start at the weakest link. The
acting. Still great, but any passable actor could have been given any
of the major roles and done a great job.
The code will only retrieve
Saw an early screening tonight in Denver.
Here is my code:
import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from tqdm import tqdm

# driver: a Selenium webdriver that has already loaded the IMDB reviews page (url)
reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')
first_review = reviews[0]
sel2 = Selector(text=first_review.get_attribute('innerHTML'))

rating_list = []
review_date_list = []
review_title_list = []
author_list = []
review_list = []
error_url_list = []
error_msg_list = []

reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')
for d in tqdm(reviews):
    try:
        sel2 = Selector(text=d.get_attribute('innerHTML'))
        try:
            rating = sel2.css('.rating-other-user-rating span::text').extract_first()
        except:
            rating = np.NaN
        try:
            review = sel2.css('.text.show-more__control::text').get()
        except:
            review = np.NaN
        try:
            review_date = sel2.css('.review-date::text').extract_first()
        except:
            review_date = np.NaN
        try:
            author = sel2.css('.display-name-link a::text').extract_first()
        except:
            author = np.NaN
        try:
            review_title = sel2.css('a.title::text').extract_first()
        except:
            review_title = np.NaN
        rating_list.append(rating)
        review_date_list.append(review_date)
        review_title_list.append(review_title)
        author_list.append(author)
        review_list.append(review)
    except Exception as e:
        error_url_list.append(url)
        error_msg_list.append(e)

review_df = pd.DataFrame({
    'review_date': review_date_list,
    'author': author_list,
    'rating': rating_list,
    'review_title': review_title_list,
    'review': review_list
})

Use .extract() instead of .get() to extract all the matching text nodes as a list. Then you can use .join() to concatenate them into a single string.
review = sel2.css('.text.show-more__control::text').extract()
review = ' '.join(review)
output:
'For a teenager today, Dunkirk must seem even more distant than the
Boer War did to my generation growing up just after WW2. For some,
Christopher Nolan's film may be the most they will know about the
event. But it's enough in some ways because even if it doesn't show
everything that happened, maybe it goes as close as a film could to
letting you know how it felt. "Dunkirk" focuses on a number of
characters who are inside the event, living it ....'
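
To keep the original loop structure, the relevant part of the try block would become something like this (a minimal sketch; join with '\n' instead of ' ' if you want to preserve the paragraph breaks):

# inside the loop over review containers
try:
    # .extract() returns every matching text node, not just the first one
    review_parts = sel2.css('.text.show-more__control::text').extract()
    review = ' '.join(part.strip() for part in review_parts)
except Exception:
    review = np.NaN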

Related

Yellow Pages Python web scraping stuck on first iteration

I'm trying to scrape Yellow Pages, but my code is stuck on the first business of each page and skips every other business on the page, e.g. the 1st company of page 1, the 1st company of page 2, etc.
I have no clue why it isn't iterating through the 'web_page' variable first, then checking for additional pages, and finally looking for the closing condition and executing `break`.
If anyone can provide me with clues or help it would be highly appreciated!
import requests
from bs4 import BeautifulSoup as bs

# headers: request headers (e.g. a User-Agent) defined elsewhere
web_page_results = []

def yellow_pages_scraper(search_term, location):
    page = 1
    while True:
        url = f'https://www.yellowpages.com/search?search_terms={search_term}&geo_location_terms={location}&page={page}'
        r = requests.get(url, headers=headers)
        soup = bs(r.content, 'html.parser')
        web_page = soup.find_all('div', {'class': 'search-results organic'})
        for business in web_page:
            business_dict = {}
            try:
                business_dict['name'] = business.find('a', {'class': 'business-name'}).text
                print(f'{business_dict["name"]}')
            except AttributeError:
                business_dict['name'] = ''
            try:
                business_dict['street_address'] = business.find('div', {'class': 'street-address'}).text
            except AttributeError:
                business_dict['street_address'] = ''
            try:
                business_dict['locality'] = business.find('div', {'class': 'locality'}).text
            except AttributeError:
                business_dict['locality'] = ''
            try:
                business_dict['phone'] = business.find('div', {'class': 'phones phone primary'}).text
            except AttributeError:
                business_dict['phone'] = ''
            try:
                business_dict['website'] = business.find('a', {'class': 'track-visit-website'})['href']
            except AttributeError:
                business_dict['website'] = ''
            try:
                web_page_results.append(business_dict)
                print(web_page_results)
            except:
                print('saving not working')
        # If the last iterated page doesn't have a "next page" button, break the loop and return the list
        if not soup.find('a', {'class': 'next ajax-page'}):
            break
        page += 1
    return web_page_results
It's worth looking at this line:
web_page = soup.find_all('div', {'class':'search-results organic'})
When I go to the request URL I can only find one instance of search-results organic on the page. You then iterate over that list (web_page), but there will only be one value in it. So when you do the for loop:
for business in web_page:
you will only ever run it once, because there is a single item in the list, and therefore you only get the first result on the page.
You need to loop through the list of businesses on the page, not the container holding the business listings. I recommend creating the list from class='srp-listing':
web_page = soup.find_all('div', {'class':'srp-listing'})
This should give you a list of all the businesses on the page. When you iterate over this new list of businesses, you will go through more than just one listing.
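
A minimal sketch of the suggested change (only the selector changes; the body of the loop stays as in the question):

# one element per business listing instead of one container for the whole result set
web_page = soup.find_all('div', {'class': 'srp-listing'})
for business in web_page:
    business_dict = {}
    try:
        business_dict['name'] = business.find('a', {'class': 'business-name'}).text
    except AttributeError:
        business_dict['name'] = ''
    # ... remaining fields exactly as in the original loop ...
    web_page_results.append(business_dict)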

Python - create dataframe from output returned from a function?

Below is my code utilizing pygooglenews to scrape Google News:
import pandas as pd
from newspaper import Article
from pygooglenews import GoogleNews

# gn: a pygooglenews GoogleNews instance; config: a newspaper Config object, both created elsewhere

def get_titles1(search):
    news = []  # list of articles that will have NLP applied to it
    search = gn.search(search, when='2y')  # when: duration; 2y = 2 years, 6m = 6 months, etc.
    search_items = search['entries']  # list of articles retrieved before NLP
    for item in search_items:  # go through all of the items in search_items
        try:
            url = item.link
            article = Article(url, config=config)
            article.download()
            article.parse()
            authors = ", ".join(author for author in article.authors)
            title = article.title
            date = article.publish_date
            text = article.text
            image = article.top_image
            videos = article.movies
            url = article.url
            article.nlp()
            keywords = article.keywords
            print("\n")
            print(item.link)
            print(f"Keywords: {keywords}")
            print(f"Summary: {article.summary}")
            report = {'title': article.title, 'link': article.link, 'published': article.published}
            news.append(report)
        except:
            pass
    return news

data = get_titles1('Los Angeles')
df = pd.DataFrame(data)
print(df)
Although everything works within the function, and calling it and storing the result in the data variable works too, the DataFrame I build from it comes out empty. This is strange given that, inside the function, I create the report dictionary with 'title', 'link', and 'published' acting as column headers.
Does anybody know what the issue is? Please let me know, as I am a beginner with Python. Thank you!
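
One debugging step worth trying (a sketch, not a confirmed fix): the bare except: pass inside the loop silently discards any exception raised while building report, which would leave news empty without any visible error. Printing the exception shows which step actually fails:

# inside the for loop over search_items, replacing the bare `except: pass`
try:
    report = {'title': article.title, 'link': article.link, 'published': article.published}
    news.append(report)
except Exception as e:
    # surface the error instead of discarding it; this shows which attribute or step fails
    print(f"skipped {item.link}: {e}")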

Beautiful_Soup loop over errors

I'm currently trying to web scrape numbeo.com with BeautifulSoup to extract the cost of living for the ~200 cities in my data frame.
I have the following code, but there's an issue: the cities are written differently in the URL. For example, some URLs contain only the name of the city while others end with a hyphen and a state abbreviation.
https://www.numbeo.com/cost-of-living/in/Saint-Petersburg-FL
https://www.numbeo.com/cost-of-living/in/Detroit
There are some other issues as well, but how do I reconfigure the code below so it jumps to another option if there's an error?
import requests as r
from bs4 import BeautifulSoup as bs

cofl_list = []

def cost_living(cit):
    cit = str(cit)
    cit = cit.replace('St. Petersburg', 'Saint-Petersburg-FL')
    cit = cit.replace(' ', '-')
    cit = cit.replace('St.', 'Saint')
    url = r.get(f'https://www.numbeo.com/cost-of-living/in/{cit}')
    soup = bs(url.content)
    cof = soup.find_all('span', attrs={'class': 'emp_number'})
    cof_rev = cof[1]
    cof_rev = str(cof_rev)
    cof_rev = cof_rev.replace('$', '')
    cof_rev = cof_rev.replace('<span class="emp_number">', '')
    cof_rev = cof_rev.replace('</span>', '')
    cof_rev = float(cof_rev)
    cofl_list.append(cof_rev)
for example:
def my_func():
    try:
        # execute a block of code
        ...
    except Exception:
        # if an error occurs in the try block it is caught here,
        # so you can run a fallback (or `continue` if this is inside a loop)
        ...
However, if there is code in the try block after the line that raises the error, it will not be executed.
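
Applied to the function in the question, one possible shape is to try each URL variant in turn and move on when one fails. This is a sketch only: the state_abbr parameter and the hyphen-plus-abbreviation URL form are assumptions about how numbeo names its pages, not something confirmed by the question:

import requests as r
from bs4 import BeautifulSoup as bs

cofl_list = []

def cost_living(cit, state_abbr=None):
    # candidate URL slugs, tried in order; the state-suffixed form is a guess at numbeo's naming
    base = str(cit).replace('St. ', 'Saint ').replace(' ', '-')
    candidates = [base]
    if state_abbr:
        candidates.append(f'{base}-{state_abbr}')
    for slug in candidates:
        try:
            resp = r.get(f'https://www.numbeo.com/cost-of-living/in/{slug}')
            soup = bs(resp.content, 'html.parser')
            cof = soup.find_all('span', attrs={'class': 'emp_number'})
            cofl_list.append(float(cof[1].text.replace('$', '').replace(',', '')))
            return  # success: stop trying other URL variants
        except Exception:
            continue  # this variant failed; try the next one
    print(f'no cost-of-living value found for {cit}')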

Problems with text data-cleaning in python

I am working on a program that crawls internet articles from the web. The program starts by entering the start and end pages of the website.
This program works in the following order.
Web crawling of article information (title, sort, time, contents)
Remove special characters
Only nouns are extracted.
The problem seems to occur when extracting nouns while cleaning the content of the article. Everything works up to the stage before noun extraction.
The error message is as follows
ValueError: Length of passed values is 4, index implies 5
To solve this, I tried adding each row with DataFrame append.
But it doesn't solve the problem.
I use the konlpy method (a Korean morpheme analyzer).
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
from konlpy.tag import Okt
from pandas import Series

i = input('Start page? : ')
k = input('End page? : ')
startpage = int(i)
lastpage = int(k)
count = int(i)

# Definition of text cleaning function
def text_cleaning(text):
    hangul = re.compile('[^ㄱ-ㅣ가-힣]+')
    result = hangul.sub(' ', text)
    return result

# Definition of nouns extraction function
def get_nouns(x):
    nouns_tagger = Okt()
    nouns = nouns_tagger.nouns(x)
    nouns = [noun for noun in nouns if len(noun) > 1]
    nouns = [noun for noun in nouns if noun not in stopwords]
    return nouns

# dataframe formation
columns = ['Title', 'Sort', 'Datetime', 'Article']
news_info = pd.DataFrame(columns=columns)
idx = 0

# Web-site page loop
while startpage < lastpage + 1:
    url = f'http://www.koscaj.com/news/articleList.html?page={startpage}&total=72698&box_idxno=&sc_section_code=S1N2&view_type=sm'
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all(class_='list-titles')
    print(f'-----{count}page result-----')
    # Articles loop in the web-site page
    for link in links:
        news_url = "http://www.koscaj.com" + link.find('a')['href']
        news_link = urllib.request.urlopen(news_url).read()
        soup2 = BeautifulSoup(news_link, 'html.parser')
        # an article's title
        title = soup2.find('div', {'class': 'article-head-title'})
        if title:
            title = soup2.find('div', {'class': 'article-head-title'}).text
        else:
            title = ''
        # an article's sort
        sorts = soup2.find('nav', {'class': 'article-head-nav auto-marbtm-10'})
        try:
            sorts2 = sorts.find_all('a')
            sort = sorts2[2].text
        except:
            sort = ''
        # an article's time
        date = soup2.find('div', {'class': 'info-text'})
        try:
            datetime = date.find('i', {'class': 'fa fa-clock-o fa-fw'}).parent.text.strip()
            datetime = datetime.replace("승인", "")
        except:
            datetime = ''
        # an article's content
        article = soup2.find('div', {'id': 'article-view-content-div'})
        if article:
            article = soup2.find('div', {'id': 'article-view-content-div'}).text
            article = article.replace("\n", "")
            article = article.replace("\r", "")
            article = article.replace("\t", "")
            article = article.replace("[전문건설신문] koscaj#kosca.or.kr", "")
            article = article.replace("저작권자 © 대한전문건설신문 무단전재 및 재배포 금지", "")
            article = article.replace("전문건설신문", "")
            article = article.replace("다른기사 보기", "")
        else:
            article = ''
        # Remove special characters
        news_info['Title'] = news_info['Title'].apply(lambda x: text_cleaning(x))
        news_info['Sort'] = news_info['Sort'].apply(lambda x: text_cleaning(x))
        news_info['Article'] = news_info['Article'].apply(lambda x: text_cleaning(x))
Up to this point, the program runs without any problems. But, as the error message shows, the operation fails because the length of the passed values and the index do not match.
Text data cleaning for noun extraction:
        # Dataframe for storing after crawling individual articles
        row = [title, sort, datetime, article]
        series = pd.Series(row, index=news_info.columns)
        news_info = news_info.append(series, ignore_index=True)
        # Load Korean stopword dictionary file
        path = "C:/Users/이바울/Desktop/이바울/코딩파일/stopwords-ko.txt"
        with open(path, encoding='utf-8') as f:
            stopwords = f.readlines()
        stopwords = [x.strip() for x in stopwords]
        news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))
    startpage += 1
    count += 1

news_info.to_excel(f'processing{lastpage-int(1)}-{startpage-int(1)}.xlsx')
print('Complete')
After setting up the existing 4 columns in the Pandas DataFrame, append was used to add the extracted-nouns column as a 5th column. I know this method adds a column regardless of the index name. As the error screenshot below shows, the first article is crawled and its results appear; from the next article on, it stops working and the error occurs.
[Screenshot: program error result]
[Link: Korean stopwords dictionary]
I solved the problem myself.
It came down to where the code sits inside the for loop.
I fixed it by repositioning the problematic lines while leaving the code that already worked untouched.
In the end, removing just two levels of indentation from the line below (so it runs after the loops instead of inside them) solved the problem.
news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))
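
Concretely, the repositioning amounts to keeping only the four raw fields inside the article loop and adding the derived column once after the loops, roughly as follows (a sketch based on the description above, reusing the original names):

# Articles loop: only collect the four raw fields per article
for link in links:
    ...
    row = [title, sort, datetime, article]
    series = pd.Series(row, index=news_info.columns)   # 4 values, 4 columns: lengths match
    news_info = news_info.append(series, ignore_index=True)

# After the loops: add the derived column once, on the complete DataFrame.
# Because no more 4-item rows are appended afterwards, the
# "Length of passed values is 4, index implies 5" error no longer occurs.
news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))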

Scraping a certain range from a table at a webpage

I'm trying to scrape data from this website, which has a table of game credits for different categories. There are 24 categories in total that I want to turn into 24 columns. On the example webpage there are 5 (production, design, engineering, and thanks).
It would have been easy if they had different classes, but they all share the same h3 class: "clean". Different pages have different categories, and the order changes from page to page. On top of that, the information I need is actually in the next row of the table, in a different class.
So what I figured is: if I write 24 if-statements, one per category, checking whether any h3 with class "clean" contains that category, I can scrape the class I need and otherwise store 'none'. The problem is that they all share the same class, so I think I can use td colspan="5" as a marker to tell Python where each category starts and ends.
My question is: is there a way to program it to scrape until it encounters td colspan="5" and then stop?
import bs4 as bs
import urllib.request

gameurl = "https://www.mobygames.com/developer/sheet/view/developerId,1"
req = urllib.request.Request(gameurl, headers={'User-Agent': 'Mozilla/5.0'})
sauce = urllib.request.urlopen(req).read()
soup = bs.BeautifulSoup(sauce, 'lxml')
infopage = soup.find_all("div", {"class": "col-md-8 col-lg-8"})
core_list = []
for credits in infopage:
    niceHeaderTitle = credits.find_all("h1", {"class": "niceHeaderTitle"})
    name = niceHeaderTitle[0].text
    Titles = credits.find_all("h3", {"class": "clean"})
    Titles = [title.get_text() for title in Titles]
    if 'Business' in Titles:
        businessinfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        business = businessinfo[0].get_text(strip=True)
    else:
        business = 'none'
    if 'Production' in Titles:
        productioninfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        production = productioninfo[0].get_text(strip=True)
    else:
        production = 'none'
    if 'Design' in Titles:
        designinfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        design = designinfo[0].get_text(strip=True)
    else:
        design = 'none'
    if 'Writers' in Titles:
        writersinfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        writers = writersinfo[0].get_text(strip=True)
    else:
        writers = 'none'
    if 'Programming/Engineering' in Titles:
        programinfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        program = programinfo[0].get_text(strip=True)
    else:
        program = 'none'
    if 'Video/Cinematics' in Titles:
        videoinfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        video = videoinfo[0].get_text(strip=True)
    else:
        video = 'none'
    if 'Audio' in Titles:
        Audioinfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        audio = Audioinfo[0].get_text(strip=True)
    else:
        audio = 'none'
    if 'Art/Graphics' in Titles:
        artinfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        art = artinfo[0].get_text(strip=True)
    else:
        art = 'none'
    if 'Support' in Titles:
        supportinfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        support = supportinfo[0].get_text(strip=True)
    else:
        support = 'none'
    if 'Thanks' in Titles:
        thanksinfo = credits.find_all("tr", {"class": "devCreditsHighlight"})
        thanks = thanksinfo[0].get_text(strip=True)
    else:
        thanks = 'none'
    games = [name, business, production, design, writers, video, audio, art, support, program, thanks]
    core_list.append(games)
print(core_list)
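
No answer is included here, but as a sketch of the idea in the question (assuming each category's h3 class="clean" heading sits in a td colspan="5" row and the credit rows follow until the next such separator, which is how the question describes the markup; soup is the same BeautifulSoup object as above), one could collect rows per category and stop at the separator:

credits_by_category = {}
for heading in soup.find_all("h3", {"class": "clean"}):
    category = heading.get_text(strip=True)
    names = []
    # assumed structure: the heading sits in a td colspan="5" row and the
    # credit rows follow until the next td colspan="5" separator
    header_row = heading.find_parent("tr")
    if header_row is None:
        continue
    for row in header_row.find_next_siblings("tr"):
        if row.find("td", attrs={"colspan": "5"}):
            break  # next category separator reached: stop collecting
        names.append(row.get_text(" ", strip=True))
    credits_by_category[category] = "; ".join(names)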
