I'm new to Python. I'm scraping a website and storing the results in a CSV file, but I can't work out how to check whether an entry already exists in the file. I imagine the right approach is to loop through the rows and check whether the title is already there. Does anyone have an idea how to solve this? See the code below:
from pathlib import Path
import time
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
DRIVER_PATH = str(Path('geckodriver').resolve())
def write_csv(ads):
    filename = 'results.csv'
    with open(filename, 'a+') as f:
        fields = ['title', 'url']
        writer = csv.DictWriter(f, fieldnames=fields)
        existing_lines = csv.reader(f)
        for ad in ads:
            if ad['title'] not in existing_lines:
                print(existing_lines)
                writer.writerow(ad)
                print('success')
            else:
                print('fail')
def get_html(url):
    browser = webdriver.Firefox(executable_path=DRIVER_PATH)
    browser.get(url)
    return browser.page_source
def scrapde_data(card):
    try:
        h2 = card.h2
    except:
        title = ''
        url = ''
    else:
        title = h2.text.strip()
        try:
            url = card.find('a').get('href')
        except:
            url = ''
    data = {'title': title, 'url': url}
    return data
def main():
    while True:
        url = '#'
        html = get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        cards = soup.find_all('div', {"class": "produto--comprar"})
        ads_data = []
        for card in cards:
            data = scrapde_data(card)
            ads_data.append(data)
        write_csv(ads_data)
        time.sleep(5)

if __name__ == '__main__':
    main()
Help me please? :(
Try the following code:
def write_csv(ads):
    filename = 'results.csv'
    with open(filename, 'a+', newline='') as f:
        fields = ['title', 'url']
        writer = csv.DictWriter(f, fieldnames=fields)
        # moving the file pointer to the start of the file
        f.seek(0)
        existing_lines = csv.reader(f)
        # checking whether the file has at least one line
        count = 0
        for line in existing_lines:
            count += 1
            break
        # if the file is not empty
        if count > 0:
            for ad in ads:
                flag = 0
                # moving the file pointer back to the start of the file
                f.seek(0)
                # checking if ad['title'] is present in the first column of the csv file
                for line in existing_lines:
                    if ad['title'] in line[0]:
                        flag = 1
                # if ad['title'] was not found, append it
                if flag == 0:
                    writer.writerow(ad)
        # if the file is empty, write the dictionary contents into the csv
        else:
            for ad in ads:
                writer.writerow(ad)
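If rescanning the file for every ad feels heavy, a minimal alternative sketch (assuming the same results.csv layout with the title in the first column) reads the existing titles into a set once and only appends rows whose title has not been seen yet:

import csv

def write_csv(ads, filename='results.csv'):
    fields = ['title', 'url']
    # collect the titles already present in the first column, if the file exists
    try:
        with open(filename, newline='') as f:
            existing_titles = {row[0] for row in csv.reader(f) if row}
    except FileNotFoundError:
        existing_titles = set()
    # append only the ads whose title is not in the file yet
    with open(filename, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        for ad in ads:
            if ad['title'] not in existing_titles:
                writer.writerow(ad)
                existing_titles.add(ad['title'])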
Related
I'm trying to write a list of links to a CSV file so that each search term and its URL come out as one row.
I'm sure I'm not using the csv library correctly, since it writes each character of just the first link to the file. Here's my code:
for t in terms:
    fields = ["Search Term", "URL"]
    url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, "lxml")
    for item in soup.find_all("item"):
        link = str(item)
        i = link.find("<link/>")
        j = link.find("<guid")
        links = link[i+7:j]
    with open("urls.csv", "w") as f:
        write = csv.writer(f)
        write.writerow(fields)
        write.writerows(links)
Any help would be so appreciated. Thanks!!
Use the xml parser when creating the soup:
import csv
import requests
from bs4 import BeautifulSoup

terms = ["refrigerator", "kitchen sink"]

with open("urls.csv", "w") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["Search Term", "URL"])
    for t in terms:
        url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
        print(f"Getting {url}")
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content, "xml")
        for item in soup.find_all("link"):
            writer.writerow([t, item.get_text(strip=True)])
Creates urls.csv with one search term and URL per row (shown in the original post as a LibreOffice screenshot).
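The reason the original attempt needed string slicing is that html.parser treats <link> as a void element, so the URL ends up outside the tag; the xml parser keeps it as the tag's text. A small check, assuming lxml is installed so the "xml" parser is available:

from bs4 import BeautifulSoup

rss = "<item><title>t</title><link>https://example.com</link></item>"

# html.parser closes <link> immediately, so the URL is not inside the tag
print(BeautifulSoup(rss, "html.parser").find("link"))        # <link/>

# the xml parser keeps the URL as the tag's text
print(BeautifulSoup(rss, "xml").find("link").get_text())     # https://example.com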
There is this URL: https://www.jpx.co.jp/english/listing/stocks/new/index.html#3422
I wrote (copied and pasted from the internet!) the following code to save all the PDFs that are inside the table to a folder:
from PyPDF2 import PdfFileReader
import requests
from bs4 import BeautifulSoup
import io
import urllib.request as req
import urllib
import os
import time
from urllib.parse import urljoin
url = 'https://www.jpx.co.jp/english/listing/stocks/new/index.html'
headers = {'User-Agent':'Mozilla/5.0'}
res = req.urlopen(url)
soup = BeautifulSoup(res, "html.parser")
result = soup.select("a[href]")
link_list =[]
for link in result:
    href = link.get("href")
    link_list.append(href)

pdf_list = [temp for temp in link_list if temp.endswith('pdf')]
print(pdf_list)

abs_pdf_list = []
for relative in pdf_list:
    temp_url = urljoin(url, relative)
    abs_pdf_list.append(temp_url)

filename_list = []
for target in abs_pdf_list:
    temp_list = target.split("/")
    filename_list.append(temp_list[len(temp_list)-1])

newpath = r'/Users/myfolder/python/IPO'
if not os.path.exists(newpath):
    os.makedirs(newpath)

target_dir = "/Users/myfolder/python/IPO/"
savepath_list = []
for filename in filename_list:
    savepath_list.append(os.path.join(target_dir, filename))
savepath_list

for (pdflink, savepath) in zip(abs_pdf_list, savepath_list):
    print(pdflink)
    urllib.request.urlretrieve(pdflink, savepath)
    time.sleep(2)
import pdfplumber
import re
def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url) as r:
        with open(local_filename, 'wb') as f:
            f.write(r.content)
    return local_filename

ap_url = abs_pdf_list[0]
ap = download_file(ap_url)

with pdfplumber.open(ap) as pdf:
    page1 = pdf.pages[0]
    page2 = pdf.pages[1]
    text = page1.extract_text()
    print(text)
Now I need to read those PDFs and extract the lines below.
From page 1:
the line that starts with "Information & Communication"
From page 2:
the lines that start with
"Book-building Period"
"Offering Price"
and save them in one Excel or CSV file.
Sadly I have reached the limit of my coding skills and can't move any further. I can convert the PDF to text, but …
Please advise me how to do this.
I would recommend installing our new package, pdftextract, which preserves the PDF layout as well as possible when extracting text, and then using some regex to pull out the keywords.
Here's a working code snippet tested on 2 pdf files from your link:
import re
import csv
from pdftextract import XPdf

pdf_files = ['a.pdf', 'b.pdf']
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_infos(file: str, keywords: list):
    """Extract the text from the pdf file, then get the wanted keywords' information."""
    # extracting the text from the pdf while keeping the original layout
    pdf = XPdf(file)
    txt = pdf.to_text(keep_layout=True)
    row = []
    # getting the keywords' information
    for keyword in keywords:
        # search for the keyword and capture what follows it
        pattern = "{} (.+)\r".format(keyword)  # extracting the wanted info
        regex = re.compile(pattern, flags=re.I | re.M)
        m = regex.search(txt)
        if m is not None:
            m = m.groups()[0].strip(' /\r')  # strip unwanted spaces and characters
            row.append(m)
    return row

def main(files: list, fname: str, headers: list):
    """Extract the wanted info from a bunch of pdf files and save them as a csv file."""
    with open(fname, "w") as wf:
        writer = csv.writer(wf)
        writer.writerow(headers)
        for i, file in enumerate(files, start=1):
            row = extract_infos(file, headers)
            writer.writerow(row)
        print("[DONE]", "wrote {} rows to {}.".format(i, fname))

main(pdf_files, "stocks.csv", keywords)
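If pdftextract isn't available, roughly the same extraction can be sketched with pdfplumber, which the question already uses; this is only a sketch, and the regex may need adjusting to the exact text layout pdfplumber produces:

import re
import pdfplumber

def extract_infos_pdfplumber(file, keywords):
    # pull the text of the first two pages, where the wanted lines appear
    with pdfplumber.open(file) as pdf:
        txt = "\n".join(page.extract_text() or "" for page in pdf.pages[:2])
    row = []
    for keyword in keywords:
        # capture whatever follows the keyword on the same line
        m = re.search(r"{}\s*(.+)".format(re.escape(keyword)), txt, flags=re.I)
        row.append(m.group(1).strip(" /") if m else "")
    return row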
Having trouble writing to a CSV file. Code is below. The set is being written onto a single row when I write it into the "fofo" file.
response = requests.get(href)
soup = BeautifulSoup(response.content, 'lxml')
This opens the "shomo" file with the existing hrefs:
with open('shomo.csv', newline='') as f:
    reader = csv.reader(f)
    seen = {row[0] for row in reader}

allthreads = soup.find('table', class_='categories').find_all('p')
for thread in allthreads:
    thread_link = thread.a.get('href')
    # Checks if link is in "seen"
    if thread_link not in seen:
        seen.add(thread_link)  # Add new href to seen
        thread_data = scrape_thread_link(thread_link)  # Calls function
        # Having trouble with this part
        with open('fofo.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerows([seen])
Without opening a previous file and adding to seen, the code below prints perfectly fine.
I'm not sure what is different or wrong:
import csv
import time
from bs4 import BeautifulSoup
import requests
import re
response = requests.get('https://website.com')
soup = BeautifulSoup(response.content, 'lxml')

seen = set()

with open('momo.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    allthreads = soup.find('table', class_='categories').find_all('p')
    for thread in allthreads:
        #thread_name = thread.text
        #print(thread_name)
        thread_link = thread.a.get('href')
        if thread_link not in seen:
            seen.add(thread_link)
            writer.writerows([seen])
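One likely cause: writer.writerows([seen]) passes the whole set as a single row, so every link lands on the same line. A minimal sketch (keeping the file names and the soup object from the question) that writes one new link per row instead:

import csv

# links already recorded in the first column of shomo.csv
with open('shomo.csv', newline='') as f:
    seen = {row[0] for row in csv.reader(f) if row}

with open('fofo.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    allthreads = soup.find('table', class_='categories').find_all('p')
    for thread in allthreads:
        thread_link = thread.a.get('href')
        if thread_link not in seen:
            seen.add(thread_link)
            writer.writerow([thread_link])  # one link per row, not the whole set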
I'm trying to pull in a list of stocks from a CSV file, look up each stock ticker on finviz.com, and export the data to a CSV file. I'm new to Python programming, but I know this will help me and others. This is what I have so far:
import csv
import urllib.request
from bs4 import BeautifulSoup
with open('shortlist.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    name = None
    for row in reader:
        if row[0]:
            name = row[0]
            print(name)

write_header = True
sauce = print(name)
soup = BeautifulSoup(sauce.text, 'html.parser')
print(soup.title.text)

symbols = name
""""
print(symbols)
"""

URL_BASE = "https://finviz.com/quote.ashx?t="

with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for ticker in symbols:
        URL = URL_BASE + ticker
        try:
            fpage = urllib.request.urlopen(URL)
            fsoup = BeautifulSoup(fpage, 'html.parser')
            if write_header:
                # note the change
                writer.writerow(['ticker'] + list(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'}))))
                write_header = False
            # note the change
            writer.writerow([ticker] + list(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'}))))
        except urllib.request.HTTPError:
            print("{} - not found".format(URL))
I'm missing the output in the CSV file "output.csv"; I'm only seeing the data from my input CSV file "shortlist". The link between the two is not set up correctly. I've spent a couple of weeks researching and working on how to do this. Your help is greatly appreciated.
import csv
import urllib.request
from bs4 import BeautifulSoup
with open('shortlist.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    name = None
    for row in reader:
        if row[0]:
            name = row[0]
            print(name)

write_header = True
#sauce = print(name)
#soup = BeautifulSoup(sauce.text, 'html.parser')
#print(soup.title.text)

symbols = name
""""
print(symbols)
"""

URL_BASE = "https://finviz.com/quote.ashx?t="

with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for ticker in symbols:
        URL = URL_BASE + ticker
        try:
            fpage = urllib.request.urlopen(URL)
            fsoup = BeautifulSoup(fpage, 'html.parser')
            if write_header:
                # note the change
                writer.writerow(['ticker'] + list(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'}))))
                write_header = False
            # note the change
            writer.writerow([ticker] + list(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'}))))
        except urllib.request.HTTPError:
            print("{} - not found".format(URL))
(The output was posted as a screenshot.)
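A likely culprit in the code above: symbols = name keeps only the last ticker as a plain string, so for ticker in symbols loops over its individual characters, and sauce = print(name) is None rather than a response object. A sketch that reads every ticker into a list instead (assuming the ticker sits in the first column of shortlist.csv):

import csv

with open('shortlist.csv', newline='') as csvfile:
    # one ticker per row, taken from the first column; blank rows are skipped
    symbols = [row[0] for row in csv.reader(csvfile) if row and row[0]]

print(symbols)
# the download loop over URL_BASE + ticker can then stay as posted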
When moving from the get_page_data function to the for loop that extracts the business name, address, etc., there's a problem that raises a NoneType error. I now know that means a value of None is being passed into the for loop, but I'm not sure what's causing this error:
AttributeError: 'NoneType' object has no attribute 'text'
#!/opt/local/bin/python
import requests
import re
from bs4 import BeautifulSoup
import csv
#Read csv
with open ("gyms4.csv") as file:
reader = csv.reader(file)
csvfilelist = [row[0] for row in reader]
#Get data from each url
def get_page_data():
for page_data in csvfilelist:
r = requests.get(page_data.strip())
soup = BeautifulSoup(r.text, 'html.parser')
yield soup
#Complete work on data
for page in get_page_data():
name = page.find("span",{"class":"wlt_shortcode_TITLE"}).text
address = page.find("span",{"class":"wlt_shortcode_map_location"}).text
phoneNum = page.find("span",{"class":"wlt_shortcode_phoneNum"}).text
email = page.find("span",{"class":"wlt_shortcode_EMAIL"}).text
th = pages.find('b',text="Category")
td = th.findNext()
for link in td.findAll('a',href=True):
match = re.search(r'http://(\w+).(\w+).(\w+)', link.text)
if match:
web_address = link.text
gyms = [name,address,phoneNum,email,web_address]
gyms.append(gyms)
#Saving specific listing data to csv
with open ("xgyms.csv", "w") as file:
writer = csv.writer(file)
for row in gyms:
writer.writerow([row])
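The AttributeError most likely means one of the find() calls returned None because that span is missing on a particular page. A small guard, sketched around the same class names, returns an empty string instead of crashing:

def text_or_blank(page, cls):
    """Return the span's text, or '' if the span is absent on this page."""
    tag = page.find("span", {"class": cls})
    return tag.text.strip() if tag else ""

# inside the loop over get_page_data():
# name = text_or_blank(page, "wlt_shortcode_TITLE")
# address = text_or_blank(page, "wlt_shortcode_map_location")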