Remove duplicate pages from a PDF - python

I have a PDF file which has lots of duplicate pages that I want to remove. This is my code:
pdf_reader = PyPDF2.PdfFileReader(filename_path)
print(pdf_reader.getNumPages())
pdf_writer = PyPDF2.PdfFileWriter()
last_page_n = pdf_reader.getNumPages() - 1
megalist1 = []
for i in range(last_page_n):
    current_page = pdf_reader.getPage(i)
    megalist1.append(current_page)
res = []
[res.append(x) for x in megalist1 if x not in res]
print(len(megalist1))
It doesn't raise any error, but it doesn't work either.
What am I doing wrong?

That's not how list comprehensions are meant to be used (here the list is built purely for its side effects), but you could have performed the duplicate check when adding to your original list, i.e.:
megalist1 = []
for i in range(last_page_n):
    current_page = pdf_reader.getPage(i)
    if current_page not in megalist1:
        megalist1.append(current_page)

Here's one way to fix your code:
pdf_reader = PyPDF2.PdfFileReader(filename_path)
pdf_writer = PyPDF2.PdfFileWriter()

# Create an empty list to store unique pages
unique_pages = []

# Iterate through each page in the PDF
for i in range(pdf_reader.getNumPages()):
    current_page = pdf_reader.getPage(i)
    # Check if the current page is already in the unique_pages list
    if current_page not in unique_pages:
        # If not, add it to the list
        unique_pages.append(current_page)
        # And also add it to the output PDF
        pdf_writer.addPage(current_page)

# Write the output PDF to a new file
with open("output.pdf", "wb") as out:
    pdf_writer.write(out)
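One caveat: current_page not in unique_pages compares the underlying page dictionaries, and two pages that look identical usually reference different content streams, so visually duplicated pages may slip through. If the goal is to drop pages whose text content repeats, a hash of the extracted text can serve as a fingerprint. A minimal sketch, assuming the PyPDF2 1.x API used above and reusing filename_path from the question (extractText() output is imperfect, so treat this as a heuristic):

import hashlib

import PyPDF2

pdf_reader = PyPDF2.PdfFileReader(filename_path)
pdf_writer = PyPDF2.PdfFileWriter()

seen_hashes = set()
for i in range(pdf_reader.getNumPages()):
    page = pdf_reader.getPage(i)
    # Fingerprint the page by its extracted text (imperfect, but often good enough)
    text = page.extractText()
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()
    if digest not in seen_hashes:
        seen_hashes.add(digest)
        pdf_writer.addPage(page)

with open("output.pdf", "wb") as out:
    pdf_writer.write(out)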

Related

How to Merge two pages from a pdf file as one page

I have a PDF which contains 6 pages of images in total. I want to merge pages 1 and 2 into a single one-page PDF, and do the same for pages 3 to 6.
I have already split all 6 pages of the PDF into individual PDFs.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter

def pdf_splitter(path):
    fname = os.path.splitext(os.path.basename(path))[0]
    pdf = PdfFileReader(path)
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
        output_filename = '{}_page_{}.pdf'.format(fname, page + 1)
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
        print('Created: {}'.format(output_filename))

if __name__ == '__main__':
    path = r'D:\Tasks\Samples\fw9.pdf'
    pdf_splitter(path)
I want to know how to merge pages 1 and 2 of fw9 into a single PDF containing only one page, where one half of the page is page 1 of the fw9 PDF and the other half is page 2. I have to do this for all 6 pages: pages 1-2 become one single-page PDF, pages 3-4 become another single-page PDF with both on the same page, and so on. Any ideas on how to do this would be appreciated.
The PyPDF2 library also has a PdfFileMerger object, which should do exactly what you want.
As in the example here, you can just create a PdfFileMerger, read two pages and put them into one single file.
I changed your script slightly so it also creates files with pages 0-1, 2-3, 4-5, etc. (of course page 0 is the first page, since Python numbering starts from 0).
import os
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger

def pdf_splitter(path):
    fname = os.path.splitext(os.path.basename(path))[0]
    pdf = PdfFileReader(path)
    input_paths = []
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
        output_filename = '{}_page_{}.pdf'.format(fname, page + 1)
        input_paths.append(output_filename)
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
        print('Created: {}'.format(output_filename))
        # every 2 pages!
        # Change the two if you need every other number of pages!
        if page % 2 == 1:
            pdf_merger = PdfFileMerger()  # create the PdfFileMerger
            for input_path in input_paths:
                pdf_merger.append(input_path)  # read the single pages
            # we call it pages_N-1_N, so the first would be pages_0_1!
            output_path = '{}_pages_{}_{}.pdf'.format(fname, page - 1, page)
            with open(output_path, 'wb') as fileobj:
                pdf_merger.write(fileobj)  # write the two-page pdf
            input_paths = []

if __name__ == '__main__':
    path = r'D:\Tasks\Samples\fw9.pdf'
    pdf_splitter(path)
Is this what you wanted?
This will first create a single-page PDF for each page and then combine them two by two. Creating the single-page PDFs could also be skipped, but I was not sure whether you want them or not.
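If you don't need the intermediate single-page files, here is a minimal sketch of the same two-by-two grouping using only PdfFileWriter (still the PyPDF2 1.x API; pdf_pairs is just a hypothetical helper name, and note that this produces two-page PDFs rather than drawing two pages onto one physical page):

from PyPDF2 import PdfFileReader, PdfFileWriter

def pdf_pairs(path):
    # Assumes an even number of pages, as in the 6-page fw9 example
    pdf = PdfFileReader(path)
    writer = PdfFileWriter()
    for page in range(pdf.getNumPages()):
        writer.addPage(pdf.getPage(page))
        if page % 2 == 1:
            output_path = 'pages_{}_{}.pdf'.format(page - 1, page)
            with open(output_path, 'wb') as out:
                writer.write(out)
            writer = PdfFileWriter()  # start a fresh writer for the next pair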
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2 import PageObject

# Open the file whose pages have to be merged
pdf1File = open('document.pdf', 'rb')

# Read the file that you have opened
pdf1Reader = PdfFileReader(pdf1File)

# Make a list of all pages
pages = []
for pageNum in range(pdf1Reader.numPages):
    pageObj = pdf1Reader.getPage(pageNum)
    pages.append(pageObj)

# Calculate width and height for the final output page
width = pages[0].mediaBox.getWidth() * 6
height = pages[0].mediaBox.getHeight() + 100

# Create a blank page to merge all pages onto
merged_page = PageObject.createBlankPage(None, width, height)

# Loop through all pages and merge / add them to the blank page
x = 0
for page in pages:
    merged_page.mergeScaledTranslatedPage(page, 1, x, 10)
    x = float(x) + float(page.mediaBox.getWidth())

# Create the final file with one page
writer = PdfFileWriter()
writer.addPage(merged_page)
with open('out.pdf', 'wb') as f:
    writer.write(f)
I wanted to merge 6 pages onto one page, so I used 6 as the multiplier for the page width.
This is how to merge every two pages onto a single page, placed side by side (the output page is twice as wide as a source page):
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2 import PageObject

# Open the file whose pages have to be merged
pdf1File = open('1.pdf', 'rb')

# Read the file that you have opened
pdf1Reader = PdfFileReader(pdf1File)

# Make a list of all pages
pages = []
for pageNum in range(pdf1Reader.numPages):
    pageObj = pdf1Reader.getPage(pageNum)
    pages.append(pageObj)

# Calculate width and height for the final output page
width = pages[1].mediaBox.getWidth() * 2
height = pages[1].mediaBox.getHeight()

writer = PdfFileWriter()

# Loop through all pages and merge / add them to a blank page, two at a time
y = 0
merged_page = PageObject.createBlankPage(None, width, height)
for page in range(len(pages)):
    y += 1
    if y % 2 != 0:
        merged_page.mergePage(pages[page])
        x = float(pages[page + 1].mediaBox.getWidth())
        merged_page.mergeScaledTranslatedPage(pages[page + 1], 1, x, 0)
    if y % 2 == 0:
        writer.addPage(merged_page)
        merged_page = PageObject.createBlankPage(None, width, height)
        y = 0

# Create the final file
with open('out.pdf', 'wb') as f:
    writer.write(f)

How to go through all items and then save them in a dictionary key

I want to automatically load a code from a website.
I have a list with some names and want to go through every item: take the first item, make a request, open the website, copy the code/number from the HTML (text in a span), save the result in a dictionary, and so on for all items.
I read all lines from a CSV and save them into a list.
After this I make a request to load the HTML from the website, search for the company, and read the numbers from the span.
My code:
with open(test_f, 'r') as file:
    rows = csv.reader(file,
                      delimiter=',',
                      quotechar='"')
    data = [data for data in rows]
    print(data)

url_part1 = "http://www.monetas.ch/htm/651/de/Firmen-Suchresultate.htm?Firmensuche="
url_enter_company = [data for data in rows]
url_last_part = "&CompanySearchSubmit=1"

firma_noga = []
for data in firma_noga:
    search_noga = url_part1 + url_enter_company + url_last_part
    r = requests.get(search_noga)
    soup = BeautifulSoup(r.content, 'html.parser')
    lii = soup.find_all("span")
    # print all numbers that are in a span
    numbers = [d.text for d in lii]
    print("NOGA Codes: ")
I want to get the result into a dictionary, where the key is the company name (the item from the list) and the value is the number that I read from the span:
dict = {"firma1": "620100", "firma2": "262000, 465101"}
Can someone help me? I am new to web scraping and Python, and don't know what I am doing wrong.
Split your string with a regex and handle each part depending on whether it is a number or not:
import re

for partial in re.split('([0-9]+)', myString):
    try:
        print(int(partial))
    except:
        print(partial + ' is not a number')
EDIT:
Well, myString is somewhat expected to be a string.
To get the text content of your spans as a string, you should be able to use .text, something like this:
spans = soup.find_all('span')
for span in spans:
    myString = span.text
    for partial in re.split('([0-9]+)', myString):
        try:
            print(int(partial))
        except:
            print(partial + ' is not a number')
Abstracting from my remarks in the comments, I think something like this should work for you:
firma_noga = ['firma1', 'firma2', 'firma3']  # NOT EMPTY, unlike in your code!

res_dict = {}
for data in firma_noga:
    # build the search URL from the current company name
    search_noga = url_part1 + data + url_last_part
    r = requests.get(search_noga)
    soup = BeautifulSoup(r.content, 'html.parser')
    lii = soup.find_all("span")
    for l in lii:
        if data not in res_dict:
            res_dict[data] = [l]
        else:
            res_dict[data].append(l)
Obviously this will only work if firma_noga is not empty, unlike in your code; the rest of your parsing logic should be valid as well.
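To get exactly the shape the question asks for (company name as key, the digits found in the spans as the value), here is a minimal sketch combining the pieces above. The URL parts and firma_noga come from the question; filtering the span text with a regex is an assumption about where the NOGA codes appear on the page:

import re

import requests
from bs4 import BeautifulSoup

url_part1 = "http://www.monetas.ch/htm/651/de/Firmen-Suchresultate.htm?Firmensuche="
url_last_part = "&CompanySearchSubmit=1"
firma_noga = ['firma1', 'firma2']  # company names, e.g. read from the CSV

res_dict = {}
for firma in firma_noga:
    r = requests.get(url_part1 + firma + url_last_part)
    soup = BeautifulSoup(r.content, 'html.parser')
    # collect every run of digits that appears inside a span
    numbers = []
    for span in soup.find_all('span'):
        numbers.extend(re.findall(r'[0-9]+', span.text))
    res_dict[firma] = ", ".join(numbers)

print(res_dict)  # e.g. {"firma1": "620100", "firma2": "262000, 465101"}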

Can BeautifulSoup find elements with a matching value?

I am working on a web scraper for the first time, and I am using Beautiful Soup to parse a JSON file and return several attributes that I send to a CSV.
The status variable, in the JSON array, is a binary value (0/1). I'd like to return only arrays that have a 0 for status. Is it feasible to do that?
"""soup = BeautifulSoup(html)
table = soup.find()
print soup.prettify()"""
js_data = json.loads(html)
Attraction = []
event = []
status = []
for doc in js_data["response"]["docs"]:
Attraction.append(doc["Attraction"])
event.append(doc["PostProcessedData"]["Onsales"]["event"]["date"])
status.append(doc["PostProcessedData"]["Onsales"]["status"])
with open("out.csv","w") as f:
datas = zip(Attraction,event,status)
keys = ["Attraction","event","status"]
f.write(";".join(keys))
for data in datas:
f.write(",".join([str(k).replace(",",";").replace("<br>"," ") for k in data]))
f.write("\n")
I might be missing something, but maybe this helps:
for doc in js_data["response"]["docs"]:
    if doc["PostProcessedData"]["Onsales"]["status"] == "0":
        Attraction.append(doc["Attraction"])
        event.append(doc["PostProcessedData"]["Onsales"]["event"]["date"])
        status.append(doc["PostProcessedData"]["Onsales"]["status"])
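One detail worth checking: after json.loads the status may be an int rather than a string, in which case == "0" never matches. A small sketch that handles both cases, assuming the same JSON layout as in the question:

for doc in js_data["response"]["docs"]:
    onsales = doc["PostProcessedData"]["Onsales"]
    # cast to str so both 0 (int) and "0" (string) are caught
    if str(onsales["status"]) == "0":
        Attraction.append(doc["Attraction"])
        event.append(onsales["event"]["date"])
        status.append(onsales["status"])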

How to save the string, one word per column in Python?

I'm scraping the names of massage therapists along with their addresses from a directory. The addresses are all being saved into the CSV in one column for the whole string, but the title/name of each therapist is being saved one word per column over 2 or 3 columns.
What do I need to do in order to get the extracted string to save in one column, like the addresses? (The first two lines below are example HTML from the page; the code after that is the part of the script targeting this element.)
<span class="name">
<img src="/images/famt-placeholder-sm.jpg" class="thumb" alt="Tiffani D Abraham"> Tiffani D Abraham</span>
import mechanize
from lxml import html
import csv
import io
from time import sleep

def save_products(products, writer):
    for product in products:
        for price in product['prices']:
            writer.writerow([product["title"].encode('utf-8')])
            writer.writerow([price["contact"].encode('utf-8')])
            writer.writerow([price["services"].encode('utf-8')])

f_out = open('mtResult.csv', 'wb')
writer = csv.writer(f_out)

links = [
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY",
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=2&PageSize=10",
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=3&PageSize=10",
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=4&PageSize=10",
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=5&PageSize=10",
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=6&PageSize=10",
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=7&PageSize=10",
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=8&PageSize=10",
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=9&PageSize=10",
    "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=10&PageSize=10",
]

br = mechanize.Browser()
for link in links:
    print(link)
    r = br.open(link)
    content = r.read()
    products = []
    tree = html.fromstring(content)
    product_nodes = tree.xpath('//ul[@class="famt-results"]/li')
    for product_node in product_nodes:
        product = {}
        price_nodes = product_node.xpath('.//a')
        product['prices'] = []
        for price_node in price_nodes:
            price = {}
            try:
                product['title'] = product_node.xpath('.//span[1]/text()')[0]
            except:
                product['title'] = ""
            try:
                price['services'] = price_node.xpath('./span[2]/text()')[0]
            except:
                price['services'] = ""
            try:
                price['contact'] = price_node.xpath('./span[3]/text()')[0]
            except:
                price['contact'] = ""
            product['prices'].append(price)
        products.append(product)
    save_products(products, writer)

f_out.close()
I'm not positive if this solves the issue you were having, but either way there are a few improvements and modifications you might be interested in.
For example, since each link varies by a page index you can loop through the links easily rather than copying all 50 down to a list. Each therapist per page also has their own index, so you can also loop through the xpaths for each therapist's information.
# import modules
import mechanize
from lxml import html
import csv
import io

# open browser
br = mechanize.Browser()

# create file headers
titles = ["NAME"]
services = ["TECHNIQUE(S)"]
contacts = ["CONTACT INFO"]

# loop through all 50 webpages for therapist data
for link_index in range(1, 51):
    link = "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=" + str(link_index) + "&PageSize=10"
    r = br.open(link)
    page = r.read()
    tree = html.fromstring(page)
    # loop through the data for each of the 10 therapists per page
    for therapist_index in range(1, 11):
        # store names
        title = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[1]/text()')
        titles.append(" ".join(title))
        # store techniques and convert to unicode
        service = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[2]/text()')
        try:
            services.append(service[0].encode("utf-8"))
        except:
            services.append(" ")
        # store contact info and convert to unicode
        contact = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[3]/text()')
        try:
            contacts.append(contact[0].encode("utf-8"))
        except:
            contacts.append(" ")

# open file to write to
f_out = open('mtResult.csv', 'wb')
writer = csv.writer(f_out)

# get rows in correct format
rows = zip(titles, services, contacts)

# write csv line by line
for row in rows:
    writer.writerow(row)

f_out.close()
The script loops through all 50 result pages and scrapes the relevant information for each therapist where it is provided. Finally, it writes all the data to a CSV, with everything stored under the respective columns 'Name', 'Technique(s)', and 'Contact Info', if that is what you were originally struggling with.
Hope this helps!

Python, BeautifulSoup iterating through files issue

This may end up being a really novice question, because I'm a novice, but here goes.
I have a set of .html pages obtained using wget. I want to iterate through them and extract certain info, putting it in a .csv file.
Using the code below, all the names print when my program runs, but only the info from the next-to-last page (i.e., 29.html here) prints to the .csv file. I'm trying this with only a handful of files at first; there are about 1,200 that I'd like to get into this format.
The files are based on those here: https://www.cfis.state.nm.us/media/ReportLobbyist.aspx?id=25&el=2014 where the page numbers are the id.
Thanks for any help!
from bs4 import BeautifulSoup
import urllib2
import csv

for i in xrange(22, 30):
    try:
        page = urllib2.urlopen('file:{}.html'.format(i))
    except:
        continue
    else:
        soup = BeautifulSoup(page.read())
        n = soup.find(id='ctl00_ContentPlaceHolder1_lnkBCLobbyist')
        name = n.string
        print name

        table = soup.find('table', 'reportTbl')

        # get the rows
        list_of_rows = []
        for row in table.findAll('tr')[1:]:
            col = row.findAll('td')
            filing = col[0].string
            status = col[1].string
            cont = col[2].string
            exp = col[3].string
            record = (name, filing, status, cont, exp)
            list_of_rows.append(record)

        # write to file
        writer = csv.writer(open('lob.csv', 'wb'))
        writer.writerows(list_of_rows)
You need to append each time, not overwrite: use mode 'ab'. open('lob.csv', 'wb') overwrites the file on every pass through your outer loop:
writer = csv.writer(open('lob.csv', 'ab'))
writer.writerows(list_of_rows)
You could also declare list_of_rows = [] outside the for loops and write to the file once at the very end.
If you also want page 30, you need to loop over xrange(22, 31).
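A minimal sketch of that second option, keeping the structure of the original script and writing the file once at the end (parsing details unchanged, selectors taken from the question):

from bs4 import BeautifulSoup
import urllib2
import csv

list_of_rows = []  # accumulate records from every page here

for i in xrange(22, 31):  # 31 so that 30.html is included
    try:
        page = urllib2.urlopen('file:{}.html'.format(i))
    except:
        continue
    soup = BeautifulSoup(page.read())
    name = soup.find(id='ctl00_ContentPlaceHolder1_lnkBCLobbyist').string
    table = soup.find('table', 'reportTbl')
    for row in table.findAll('tr')[1:]:
        col = row.findAll('td')
        list_of_rows.append((name, col[0].string, col[1].string,
                             col[2].string, col[3].string))

# write everything in one go, after all pages have been read
with open('lob.csv', 'wb') as f:
    csv.writer(f).writerows(list_of_rows)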
