How to save values to Excel from a site using Python?

Below is my code. I want to scrape the website and store the values in Excel:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook  # not used directly; openpyxl is the engine behind DataFrame.to_excel()

Name = []
Mob = []
Add = []
E_mail = []
website = []

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}

url = requests.get("https://www.example.com", headers=headers)
soup = BeautifulSoup(url.content, 'html.parser')
travel_name = soup.findAll(attrs={'class': 'list-group-item'})

for name in travel_name:
    for a in name.findAll('a', attrs={"class": "text-warning"}):
        user = a.text
        Name.append(user)
    pList = name.findAll('p', attrs={"class": "mb-2 text-truncate"})
    for p in pList:
        if p.text.find("Contact:") != -1:
            contact = p.text.replace("Contact:", "")
            Mob.append(contact)
        if p.text.find("Location:") != -1:
            location = p.text.replace("Location:", "")
            Add.append(location)
        if p.text.find("Email:") != -1:
            email = p.text.replace("Email:", "")
            E_mail.append(email)
        if p.text.find("Website:") != -1:
            web = p.text.replace("Website:", "")
            website.append(web)
I want to store the values in Excel column-wise, with the columns [Name, Mob, Add, E_mail, website]. I tried df = pd.DataFrame() but it is failing.

Follow the pattern below; if anything is unclear, feel free to ask for more explanation:

data = []
for name in travel_name:
    dict_ = {}  # create a dict for each item, i.e. it will become one row in the Excel file
    dict_['name'] = [a.text for a in name.findAll('a', attrs={"class": "text-warning"})]
    dict_['contact'] = ...  # code to extract the value, as done for name
    dict_['email'] = ...    # code to extract the value, as done for name
    dict_['website'] = ...  # code to extract the value, as done for name
    data.append(dict_)  # append each dictionary (later to be a row) to the list
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)

Here name, contact, email and website will be the names of the columns, and every iteration creates a row for those columns from your data.
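If you would rather keep the five separate lists from the question, a column-wise DataFrame can also be built from them directly and written with to_excel (which is where openpyxl comes in). A minimal sketch, assuming the lists are already populated by the scraping loop; the output filename is just an example. Note that pd.DataFrame needs columns of equal length, which is a common reason a plain df = pd.DataFrame(...) attempt fails when some entries lack an email or website:

import pandas as pd

columns = {"Name": Name, "Mob": Mob, "Add": Add, "E_mail": E_mail, "Website": website}

# pad the shorter lists with empty strings so all columns have equal length
longest = max(len(values) for values in columns.values())
padded = {key: values + [""] * (longest - len(values)) for key, values in columns.items()}

df = pd.DataFrame(padded)
df.to_excel("travel_agents.xlsx", index=False)  # .xlsx output is written via openpyxl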


Python write to csv file from Dict

(See the attached screenshot of the Excel file for how the data currently ends up looking in the CSV file.)
This is what I have written so far to analyze reviews from IMDb.
First it fetches the top 250 movies page from the IMDb website.
Then it fetches the movie links and review links, extracts the text of the reviews, and stores everything in a dictionary in movie_name: movie_reviews format.
In the last step I am able to print Movie_Name: Movie review on the console, but when I write to a CSV file it either raises errors or writes incorrect data.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import csv
import requests
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}

url = input('Enter - ')
while True:
    try:
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, "html.parser")
        container = soup.find_all('td', class_='titleColumn')
        break
    except:
        print("Please enter a valid url:")
        url = input('Enter - ')

def movies_list():
    movie_names = []
    movies = container[:100]  # here we get the top 100 movies we want
    for movie in movies:
        name = movie.find('a').text
        movie_names.append(name)
    return movie_names

def movie_links_list():
    movie_links = []
    movies = container[:100]
    for movie in movies:
        tag = movie.find('a')
        link = tag.get('href', None)
        movie_links.append(link)
    for i in range(len(movie_links)):
        movie_links[i] = 'https://www.imdb.com/' + movie_links[i]
    return movie_links

def review_link_list(movie_links):
    review_links = []
    for movie_link in movie_links:
        title_pos = movie_link.find('title')
        nxt_slash = movie_link.find('/', title_pos)
        nxt2_slash = movie_link.find('/', nxt_slash + 1)
        review_link = movie_link[:title_pos - 1] + movie_link[title_pos:nxt2_slash + 1] + "reviews?ref_=tt_urv"
        review_links.append(review_link)
    return review_links

def get_reviews(review_links):
    movie_names = movies_list()
    review_dict = {}
    for i in range(len(review_links)):
        movie_name = movie_names[i]
        movie_reviews = []
        review_page = requests.get(review_links[i], headers=headers)
        soup = BeautifulSoup(review_page.content, "html.parser")
        tag = soup.find_all('div', class_='content')  # find_all to return a list
        top_50 = tag[:50]
        for j in top_50:
            try:
                review = j.select('div.show-more__control')[0].text
            except:
                continue
            movie_reviews.append(review)
        review_dict[movie_name] = movie_reviews
    return review_dict

file = "abc.csv"
with open(file, 'w') as csvfile:
    for i in range(len(movies)):
        csvwriter = csv.writer(csvfile)
        Name = movies[i]
        Review = reviews_dict[Name]
        try:
            csvwriter.writerow(Review)
        except:
            csvwriter.writerow("Review does not exist")
You need to open the file and write one list per row of data:

import csv

data_dict = {"mykey": 10}
with open("mydata.csv", 'a') as file:
    writer = csv.writer(file)
    for key, value in data_dict.items():
        data = [key, value]
        writer.writerow(data)

In the CSV file "mydata.csv" you will now get:

mykey,10

Using 'a' as the mode argument to open() appends data to the file instead of overwriting the old data.
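Applied to the IMDb question above, the same key/value idea could be used to write one row per (movie, review) pair. A rough sketch that reuses the question's own functions; the exact column layout is just one possible choice:

import csv

movie_links = movie_links_list()
review_links = review_link_list(movie_links)
review_dict = get_reviews(review_links)

with open("abc.csv", "w", newline="", encoding="utf-8") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["movie_name", "review"])
    for movie_name, reviews in review_dict.items():
        if not reviews:
            csvwriter.writerow([movie_name, "Review does not exist"])
            continue
        for review in reviews:
            csvwriter.writerow([movie_name, review])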

I scraped the title, price, links and info table, and when I write the CSV file I get the title, price and links duplicated on every row.

I want to replace the duplicated title, price and link values with empty columns.
import requests
import csv
from bs4 import BeautifulSoup
requests.packages.urllib3.disable_warnings()
import pandas as pd

url = 'http://shop.kvgems-preciousstones.com/'

while True:
    session = requests.Session()
    session.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    content = session.get(url, verify=False).content
    soup = BeautifulSoup(content, "html.parser")
    posts = soup.find_all('li', {'class': 'item'})
    data = []
    for url in posts:
        title = url.find('h2', {'product-name'}).text
        price = url.find('span', {'price'}).text
        link = url.find('a').get('href')
        url_response = requests.get(link)
        url_data = url_response.text
        url_soup = BeautifulSoup(url_data, 'html.parser')
        desciption = url_soup.find('tr')
        for tr in url_soup.find_all('tr'):
            planet_data = dict()
            values = [td.text for td in tr.find_all('td')]
            planet_data['name'] = tr.find('td').text.strip()
            planet_data['info'] = tr.find_all('td')[1].text.strip()
            data.append((title, price, planet_data, link))
    #data_new = data + "," + data_desciption
    #urls = soup.find('a', {'class': 'next i-next'}).get('href')
    #url = urls
    #print(url)
    with open('ineryrge5szdqzrt.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['title', 'price', 'name', 'info', 'link'])
        # The for loop
        for title, price, planet_data, link in data:
            writer.writerow([title, price, planet_data['name'], planet_data['info'], link])
When I write the CSV I get duplicated title, price and link values, but I want only one title, price, info and link per product, with the rest left empty.
The first for loop extracts the common values (title, price and link). The second for loop then extracts all the data attributes for each item.
However, you are then writing the title, price and link fields to the CSV file for every row of data. You only need to do this for the first row of each item.
To detect whether the inner for loop is on its first row, you can change it to use the enumerate function, which gives you an extra index variable. You can then use that value to write title, price and link only when the index is 0:
for index, tr in enumerate(url_soup.find_all('tr')):
    planet_data = dict()
    values = [td.text for td in tr.find_all('td')]
    planet_data['name'] = tr.find('td').text.strip()
    planet_data['info'] = tr.find_all('td')[1].text.strip()
    if index == 0:
        data.append((title, price, planet_data, link))
    else:
        data.append((None, None, planet_data, None))
(Also I don't think you need the initial while True: part.)
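With that change, the existing writing loop already produces empty cells for the duplicates, because csv.writer renders None as an empty string. A minimal sketch of the writing step (same filename as the question, but opened with 'w' so repeated runs don't accumulate duplicate header rows):

import csv

# data holds (title, price, planet_data, link) tuples; title/price/link are None
# for every row after the first one of each product.
with open('ineryrge5szdqzrt.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['title', 'price', 'name', 'info', 'link'])
    for title, price, planet_data, link in data:
        # None values come out as empty CSV cells, so no extra handling is needed
        writer.writerow([title, price, planet_data['name'], planet_data['info'], link])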

Python scrape, skipping a <tr> tag and row

Scraping a webpage and encountering an "IndexError: list index out of range".
I'm pretty sure it's because a row in the table I am scraping is being used as a header - http://www.wsj.com/mdc/public/page/2_3022-mfsctrscan-moneyflow-20161205.html?mod=mdc_pastcalenda
from urllib2 import urlopen
import requests
from bs4 import BeautifulSoup
import re
import datetime

date = datetime.datetime.today()
url = "http://www.wsj.com/mdc/public/page/2_3022-mfsctrscan-moneyflow-20161205.html?mod=mdc_pastcalendar"
date_time = urlopen(url.format(date=date.strftime('%Y%m%d')))
address = url

print 'Retrieving information from: ' + address
print '\n'

soup = BeautifulSoup(requests.get(address).content, "lxml")
div_main = soup.find('div', {'id': 'column0'})
table_one = div_main.find('table')
rows = table_one.findAll('tr')
if len(soup.findAll('tr')) > 0:
    rows = rows[2:]
#print rows

for row in rows:
    cells = row.findAll('td')
    name = cells[0].get_text()
    last = cells[1].get_text()
    chg = cells[2].get_text()
    pct_chg = cells[3].get_text()
    money_flow = cells[4].get_text()
    tick_up = cells[5].get_text()
    tick_down = cells[6].get_text()
    up_down_Ratio = cells[7].get_text()
    money_flow = cells[8].get_text()
    tick_up = cells[9].get_text()
    tick_down = cells[10].get_text()
    up_down_Ratio = cells[11].get_text()
The intermediate rows with a single cell, like "Dow Jones U.S. Total Stock Market Sectors", are the reason you are getting this error.
Instead, why not pre-define a list of headers and dynamically create a dictionary from the values of each data row by zipping them with the list of headers:
rows = soup.select('div#column0 table tr')[2:]
headers = ['name', 'last', 'chg', 'pct_chg',
           'total_money_flow', 'total_tick_up', 'total_tick_down', 'total_up_down_ratio',
           'block_money_flow', 'block_tick_up', 'block_tick_down', 'block_up_down_ratio']
for row in rows:
    # skip non-data rows
    if row.find("td", class_="pnum") is None:
        continue
    print(dict(zip(headers, [cell.get_text(strip=True) for cell in row.find_all('td')])))
div_main = soup.find('div', {'id': 'column0'})
table_one = div_main.find('table')

# to identify the right rows
def target_row(tag):
    is_row = len(tag.find_all('td')) > 5
    row_name = tag.name == 'tr'
    return is_row and row_name

rows = table_one.find_all(target_row)
for row in rows:
    cells = row.findAll('td')
    name = cells[0].get_text()
    last = cells[1].get_text()
    chg = cells[2].get_text()
    pct_chg = cells[3].get_text()
    money_flow = cells[4].get_text()
    tick_up = cells[5].get_text()
    tick_down = cells[6].get_text()
    up_down_Ratio = cells[7].get_text()
    money_flow = cells[8].get_text()
    tick_up = cells[9].get_text()
    tick_down = cells[10].get_text()
    up_down_Ratio = cells[11].get_text()
You can pass a function that returns a bool as find's parameter; this way your code is much cleaner and more maintainable.
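Since the rest of this thread is about saving scraped rows to a file, here is a rough sketch (Python 3, unlike the question's Python 2 code) of how the dictionaries from the first answer could be collected and written out with csv.DictWriter; the output filename is just an example:

import csv

records = []
for row in rows:
    if row.find("td", class_="pnum") is None:  # skip section-header rows
        continue
    cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
    records.append(dict(zip(headers, cells)))

with open("money_flow.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    writer.writerows(records)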

Scrape page using Python requests

I have some problems with web scraping; here is my code:
from bs4 import BeautifulSoup
import requests
import re
import csv
import argparse

def save_csv_file(filename, array):
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["item_name", "item_price", "item_category"])
        writer.writerows(array)

def process_data(name, price, category):
    item_name = name.text if name else 'NA'
    item_price = price.text if price else 'NA'
    item_category = category.text if category else 'NA'
    item_name = item_name.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_price = item_price.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_category = item_category.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    return (item_name, item_price, item_category)

def do_scrap(filename, url, payload, headers):
    # Request the URL with parameters and headers
    r = requests.post(url, payload, headers=headers, allow_redirects=True)
    if r.status_code == 200:
        # Save response content in html variable
        html = r.content
        # Parse the html with bs4
        parsed_html = BeautifulSoup(html, "html.parser")
        # Print document title
        print parsed_html.head.find('title').text
        # Find all of the HTML elements which describe hotels
        tables = parsed_html.find_all("a", {"class": "result-link"})
        # Print the number of results
        print "Found %s records." % len(tables)
        # Empty helpers
        items = []
        count = 0
        # Loop over the HTML elements and collect the properties of each result
        for table in tables:
            name = table.find("h3", {"class": "result-title"})
            price = table.find("p", {"class": "price text-truncate"})
            category = table.find("p", {"class": "merchant-name text-truncate"})
            items.append(process_data(name, price, category))
            count += 1
        if count > 0:
            # Save array with data to csv file
            save_csv_file(filename=filename, array=items)
            # Print end of job info
            print "\n%s records downloaded and saved to %s." % (count, filename)
    else:
        print "Code error: %s" % r.status_code

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--product", required=True, help="Product name")
    ap.add_argument("-c", "--category", default="", help="Product category")
    args = vars(ap.parse_args())
    product = args['product']
    category = args['category']
    payload = {
        'siteSearchQuery': product,
        'from': 'colibri'
    }
    headers = {
        'Host': 'www.kelkoo.co.uk',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
    }
    url = "http://www.kelkoo.co.uk/ctl/do/search"
    filename = "%s_co_uk_kelkoo_data.csv" % product
    do_scrap(
        filename=filename,
        url=url,
        payload=payload,
        headers=headers)
After this request I get a different result than when I put this:
www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri
into my web browser. What is causing this problem? Is it something related to page redirection?
I can see multiple things that will cause you to get different results:
You initiate a POST, not a GET. Look up the params argument of requests.get.
They use JavaScript to modify the page.
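To match what the browser does, the search could be issued as a GET request with the query passed via params. A minimal sketch in Python 3 syntax, reusing the payload and headers from the question (this still won't help with any content the site renders via JavaScript):

import requests
from bs4 import BeautifulSoup

payload = {'siteSearchQuery': 'nokia 130', 'from': 'colibri'}
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'}

# requests.get encodes params into the query string, i.e.
# http://www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri
r = requests.get("http://www.kelkoo.co.uk/ctl/do/search", params=payload, headers=headers, allow_redirects=True)
print(r.status_code, r.url)

soup = BeautifulSoup(r.content, "html.parser")
print(soup.head.find('title').text)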

Python and BeautifulSoup encoding issue from UTF-8

I'm new to Python and currently writing an application that scrapes data off the web. It's mostly done; there is only a little problem left with encoding. The site is encoded in ISO-8859-1, but when I try html.decode('iso-8859-1'), it doesn't do anything.
If you run the program, use 50000 and 50126 for the PLZs and you'll see what I mean in the output. It would be awesome if someone could help me out.
import urllib.request
import urllib.parse
import time
import csv
import operator
from bs4 import BeautifulSoup

#Performs an HTTP 'POST' request, passes the result to BeautifulSoup and returns it
def doRequest(request):
    requestResult = urllib.request.urlopen(request)
    soup = BeautifulSoup(requestResult)
    return soup

#Returns all the result links from the given search parameters
def getLinksFromSearch(plz_von, plz_bis):
    database = []
    links = []
    #The search parameters
    params = {
        'name_ff': '',
        'strasse_ff': '',
        'plz_ff': plz_von,
        'plz_ff2': plz_bis,
        'ort_ff': '',
        'bundesland_ff': '',
        'land_ff': 'DE',
        'traeger_ff': '',
        'Dachverband_ff': '',
        'submit2': 'Suchen'
    }
    DATA = urllib.parse.urlencode(params)
    DATA = DATA.encode('utf-8')
    request = urllib.request.Request(
        "http://www.altenheim-adressen.de/schnellsuche/suche1.cfm",
        DATA)
    # adding charset parameter to the Content-Type header.
    request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")
    #The search request
    html = doRequest(request)
    h = html.decode('iso-8859-1')
    soup = BeautifulSoup(h)
    for link in soup.find_all('a'):
        database.append(link.get('href'))
    #Remove the first element ('None') to avoid AttributeErrors
    database.pop(0)
    for item in database:
        if item.startswith("suche"):
            links.append(item)
    return links

#Performs a search on the link results
def searchOnLinks(links):
    adresses = []
    i = 1
    j = len(links)
    print("Found", j, "results, collecting data.")
    for item in links:
        adresses.append(getContactInfoFromPage(item, i, j))
        i = i + 1
        time.sleep(0.1)
    print("All done.")
    return adresses

#A method to scrape the contact info from a search result
def getContactInfoFromPage(page, i, j):
    name = ''
    straße = ''
    plz = ''
    stadt = ''
    telefon = ''
    mail = ''
    url = ''
    data = [
        #'Name',
        #'Straße',
        #'PLZ',
        #'Stadt',
        #'Telefon',
        #'E-Mail',
        #'Homepage'
    ]
    request = urllib.request.Request("http://www.altenheim-adressen.de/schnellsuche/" + page)
    #request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
    request.add_header("Content-Type", "text/html;charset=UTF-8")
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")
    print("(", i, "/", j, ") Making request...")
    soup = doRequest(request)
    print("Done.")
    findeName = soup.findAll('b')
    name = findeName[2]
    name = name.string.split('>')
    data.append(name[0])
    straße = getFieldValue(soup, "Straße")
    data.append(straße)
    ort = getFieldValue(soup, "Ort")
    (plz, stadt) = ort.split(' ', 1)
    data.append(plz)
    data.append(stadt)
    telefon = getFieldValue(soup, "Telefon")
    data.append(telefon)
    mail = getFieldValue(soup, "EMail")
    data.append(mail)
    url = getFieldValue(soup, "Internetadresse")
    data.append(url)
    return data

#Strips the text from the given field's sibling
def getFieldValue(soup, field):
    field_label = soup.find('td', text=field + ':')
    return field_label.find_next_sibling('td').get_text(strip=True)

#The main input/output function
def inputOutput():
    #PLZ is German for zip code and consists of a five-digit number.
    #The program passes the numbers to the server, and the server
    #returns all search results between the two numbers.
    plz_von = input("Please enter first PLZ: ")
    plz_bis = input("Please enter second PLZ: ")
    links = getLinksFromSearch(plz_von, plz_bis)
    #Checks if the search yielded any results
    if len(links) > 0:
        data = searchOnLinks(links)
        file_name = input("Save as: ")
        print("Writing to file...")
        with open(file_name + '.csv', 'w', newline='') as fp:
            a = csv.writer(fp, delimiter=',')
            a.writerows(data)
    else:
        print("The search yielded no results.")

inputOutput()
Your doRequest() function returns a BeautifulSoup object; you cannot decode that object. Just use it directly:
soup = doRequest(request)
You don't need to decode the response at all; BeautifulSoup uses both hints in the HTML (<meta> headers) and statistical analysis to determine the correct input encoding.
In this case the HTML document claims it is Latin-1:
<meta name="content-type" content="text/html; charset=iso-8859-1">
The response doesn't include a character set in the Content-Type header either, so this is a case of a misconfigured server. You can force BeautifulSoup to ignore the <meta> header with:
soup = BeautifulSoup(requestResult, from_encoding='utf8')
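Put together, the request helper from the question could look roughly like this; a minimal sketch where the from_encoding override is optional and only needed if the document's own <meta> hint turns out to be wrong:

import urllib.request
from bs4 import BeautifulSoup

def doRequest(request, from_encoding=None):
    # Let BeautifulSoup detect the encoding from the <meta> tag and the raw bytes;
    # pass from_encoding only to override a wrong declaration.
    requestResult = urllib.request.urlopen(request)
    return BeautifulSoup(requestResult, "html.parser", from_encoding=from_encoding)

# Usage inside the question's code: rely on auto-detection first ...
soup = doRequest(request)
# ... and only force an encoding if umlauts still come out garbled:
# soup = doRequest(request, from_encoding="utf-8")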
