If [string] in [string] not working for requested webpage text - python

I am trying to open a webpage and scrape some strings from it into a list. The list would ultimately be populated by all of the names displayed on the webpage. In trying to do so, my code looks like this:
import xlsxwriter, urllib.request, string, http.cookiejar, requests

def main():
    username = 'john.mauran'
    password = 'fZSUME1q'
    log_url = 'https://aries.case.com.pl/'
    dest_url = 'https://aries.case.com.pl/main_odczyt.php?strona=eksperci'
    login_values = {'username' : username , 'password' : password }
    r = requests.post(dest_url, data=login_values, verify=False, allow_redirects=False)
    open_sesame = r.text

    #reads the expert page
    readpage_list = open_sesame.splitlines()

    #opens up a new file in excel
    workbook = xlsxwriter.Workbook('expert_book.xlsx')

    #adds worksheet to file
    worksheet = workbook.add_worksheet()

    #initializing the variables used to move names and dates
    #in the excel spreadsheet
    boxcoA = ""
    boxcoB = ""

    #initializing expert attribute variables and lists
    url_ticker = 0
    name_ticker = 0
    raw_list = []
    url_list = []
    name_list = []
    date_list = []

    #this loop goes through and finds all the lines
    #that contain the expert URL and name and saves them to raw_list:
    #raw_list loop
    for i in open_sesame:
        if '<tr><td align=left><a href=' in i:
            raw_list += i

    if not raw_list:
        print("List is empty")
    if raw_list:
        print(raw_list)

main()
As you can see, all I want to do is take the lines from the text returned by the requests call which start with the characters '<tr><td align=left><a href='.

I don't know exactly what you're trying to do, but this doesn't make any sense:
for i in open_sesame:
    if '<tr><td align=left><a href=' in i:
        raw_list += i
First of all, if you iterate over open_sesame, which is a string, each item in the iteration will be a character in the string. Then '<tr><td align=left><a href=' in i will always be false.
Second of all, raw_list += i is not how you append an item to a list.
Finally, why is the variable called open_sesame? Is it a joke?
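If the goal is to collect whole matching lines, a minimal fix is to iterate over the readpage_list the question already builds and use list.append (a sketch based on the question's own variables):

# iterate over lines, not characters, and append each whole matching line
for line in readpage_list:
    if '<tr><td align=left><a href=' in line:
        raw_list.append(line)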

Related

How to scrape data from PDF into Excel

I am trying to scrape the data from PDF and get it saved into an excel file. This is the pdf I needed: https://www.medicaljournals.se/acta/content_files/files/pdf/98/219/Suppl219.pdf
However, I don't need all of the data, just the following (below), saved to Excel in different cells:
From page 5, starting from P001 up to and including the Introduction - there is a P number, a title, author names, and an Introduction.
For now, I can only convert a PDF file into text (my code below) and save it all in one cell, but I need it separated into different cells:
import PyPDF2 as p2

PDFfile = open('Abstract Book from the 5th World Psoriasis and Psoriatic Arthritis Conference 2018.pdf', 'rb')
pdfread = p2.PdfFileReader(PDFfile)
pdflist = []
i = 6
while i < pdfread.getNumPages():
    pageinfo = pdfread.getPage(i)
    #print(pageinfo.extractText())
    i = i + 1
    pdflist.append(pageinfo.extractText().replace('\n', ''))
print(pdflist)
The main things you need are a 'header' regex (at least 15 uppercase letters) and an 'article' regex (the letter 'P' followed by 3 digits).
One more regex helps you divide the text by any of the section keywords:
import re
import PyPDF2 as p2

article_re = re.compile(r'[P]\d{3}')  # P001: letter 'P' and 3 digits
header_re = re.compile(r'[A-Z\s\-]{15,}|$')  # min 15 UPPERCASE letters, whitespace and '-' allowed

key_word_delimeters = ['Peoples', 'Introduction', 'Objectives', 'Methods', 'Results', 'Conclusions', 'References']

file = open('data.pdf', 'rb')
pdf = p2.PdfFileReader(file)

text = ''
for i in range(6, 63):
    text += pdf.getPage(i).extractText()  # all text in one variable

articles = []
for article in re.split(article_re, text):
    header = re.match(header_re, article)  # receiving a match
    other_text = re.split(header_re, article)[1]  # receiving the other text
    if header:
        header = header.group()  # get text from match
        item = {'header': header}

        first_name_letter = header[-1]  # save the first letter of the name to put it in the right position. Some kind of HOT BUGFIX
        header = header[:-1]  # cut last character: the first letter of the name
        header = header.replace('\n', '')  # delete line breaks
        header = header.replace('-', '')  # delete line-break hyphens

        other_text = first_name_letter + other_text

        data_array = re.split(
            'Introduction:|Objectives:|Methods:|Results:|Conclusions:|References:',
            other_text)

        for key, data in zip(key_word_delimeters, data_array):
            item[key] = data.replace('\n', '')

        articles.append(item)
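To then get each field into its own cell, as the question asks, the parsed articles can be written out with xlsxwriter; here's a minimal sketch (the output filename and column order are assumptions):

import xlsxwriter

workbook = xlsxwriter.Workbook('abstracts.xlsx')  # hypothetical output name
worksheet = workbook.add_worksheet()

columns = ['header'] + key_word_delimeters
for col, name in enumerate(columns):
    worksheet.write(0, col, name)  # header row

for row, item in enumerate(articles, start=1):
    for col, name in enumerate(columns):
        worksheet.write(row, col, item.get(name, ''))  # one field per cell

workbook.close()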

I can not find a way to deal with new pages in docx using Python

I have a docx file with 40 pages of text and I want to separate each page and import its contents into a list. Is this possible? The only way I have found is to look for the empty spots in my list, but that does not always mean a page break. With my code I get the text after the word "Subject" is found, and it stops after a blank spot is found. The thing is that I need a way to recognise a page break in my code to solve some issues, since at the moment a page break is also being treated as a " ". Thanks in advance
import os
import docx

def read(name):
    doc = docx.Document(name)
    text = []
    for par in doc.paragraphs:
        text.append(par.text)
    return text

for basename in os.listdir('files'):
    path = os.path.join('files', basename)
    jerk = read(path)

lari = []
vaccum = []
indices = []
for i in jerk:
    if not i.find('Subject'):
        lari.append(jerk.index(i))
        indices.append(jerk.index(i))
for j in jerk:
    if jerk.index(j) in lari:
        for k in range(20):
            if jerk[jerk.index(j)+k] != '':
                vaccum.append(jerk[jerk.index(j) + k + 1])
            else:
                break

final = []
var = ''
for k in vaccum:
    var = var + k
    if k == '':
        final.append(var)
        var = ''
print(vaccum)
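No answer was posted here, but one common approach is to look for explicit page breaks in the document's underlying XML: a hard page break appears as a <w:br w:type="page"/> element inside a run. A sketch along those lines (note the limits: it only sees hard page breaks, not breaks the renderer inserts automatically, and a paragraph containing a break is assigned wholly to the new page):

import docx

def read_pages(name):
    doc = docx.Document(name)
    pages = [[]]
    for par in doc.paragraphs:
        for run in par.runs:
            # python-docx exposes the raw XML; w:br with w:type="page" marks a hard page break
            if run._element.xpath('.//w:br[@w:type="page"]'):
                pages.append([])  # start collecting a new page
        pages[-1].append(par.text)
    return ['\n'.join(p) for p in pages]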

Loops and .json parsing in python

Hi, I'm having trouble understanding a few things when it comes to loops and searching through a .json. I want to get the .json from a website, retrieve 25 items from its ['body'] fields, then restart on a new .json page with new ['body'] fields and retrieve those as well. Finally, send all the data to a .txt file.
Here's my code
import json
import requests

# Settings
user_id = 29851266
page_num = 1

# Finds user data
max_p_f = requests.get('http://someforum/users/'+str(user_id)+'/posts.json?page='+str(page_num))
json_string = max_p_f.text
obj = json.loads(json_string)

max_page = obj['meta']['max_page']
current_page = obj['meta']['page']
posts_count = obj['meta']['posts_count']
username = obj['users'][0]['username']

count = 0
start_page = 1
while page_num <= max_page:
    requests.get('http://www.someforum/users/'+str(user_id)+'/posts.json?page='+str(page_num))
    page_num += 1
    print("Page "+str(start_page + 1)+" complete")
    for x in range(0, 25):
        data = obj['posts'][x]['body']
        file = open(username+"_postdata.txt", 'a')
        file.write("\n =============="+str(count)+"==================\n")
        file.write(data)
        count += 1
        file.close()
I want the code to give me the 25 ['body'] values from the .json on the first page, then go to the second page and retrieve the new 25 ['body'] values. I am having trouble because when the values are written to the text file, it only shows the first 25 ['body'] values, repeated over and over until the while loop finishes.
I would start by using the native .json() for requests instead of converting it from text to json so it would be:
requests.get('http://www.someforum/users/'+str(user_id)+'/posts.json?page='+str(page_num)).json()
Also, you're just issuing the request inside the loop; you're not actually saving a new obj with the new page number,
so outside your loop:
max_p_f = 'http://someforum/users/'+str(user_id)+'/posts.json?page='
and inside your loop it should be :
obj = requests.get(max_p_f +str(page_num)).json()
Here is a sample snippet, how I would do something very similar:
base_url = 'http://somewebsite/bunchofbjectsonapage.json?page='
max_page = 3
current_page = 0

while current_page <= max_page:
    current_page = current_page + 1
    obj = requests.get(base_url + str(current_page)).json()
    for item in obj:
        name = item['company_name']
        cat = item['category']
        print([name, cat])
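Applied to the question's own code, the same pattern might look like this (a sketch; it assumes, as the original loop did, that each page carries at least 25 posts):

max_p_f = 'http://someforum/users/'+str(user_id)+'/posts.json?page='
count = 0
page_num = 1
while page_num <= max_page:
    obj = requests.get(max_p_f + str(page_num)).json()  # re-fetch and re-parse each page
    with open(username+"_postdata.txt", 'a') as f:
        for post in obj['posts'][:25]:
            f.write("\n =============="+str(count)+"==================\n")
            f.write(post['body'])
            count += 1
    print("Page "+str(page_num)+" complete")
    page_num += 1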

List append is returning two different lists

import requests
import json

def decrementList(words):
    for w in [words] + [words[:-x] for x in range(1, len(words))]:
        url = 'http://ws.spotify.com/search/1/track.json?q='
        request = requests.get(url + "%20".join(w))
        json_dict = json.loads(request.content)
        track_title = ' '.join(w)
        for track in json_dict["tracks"]:
            if track["name"].lower() == track_title.lower() and track['href']:
                return "http://open.spotify.com/track/" + track["href"][14:], words[len(w):], track["href"][14:]
    return "Sorry, no more track matches found!", None

if __name__ == "__main__":
    message = "baby asdf".split()
    size = len(message)
    while message:
        href, new_list, for_playlist = decrementList(message)
        message = new_list
        #print href
        playlist = []
        playlist.append(for_playlist)
        print playlist
In the code above, print playlist is printing two separate lists. I realize that this is occurring because the list is created and appended to within the while loop. How can I make both appends go to the same empty list, not two separate lists?
Move the declaration of the list and the print out of the while loop:

playlist = []
while message:
    #....
print playlist
That's because you reassign the name playlist to an empty list before calling append():
playlist = []
If you put it before the while loop, you should get the expected result.
playlist = []
while message:
    ...
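Putting it together, the corrected main block might look like this (a sketch assembled from the question's own code):

if __name__ == "__main__":
    message = "baby asdf".split()
    playlist = []  # created once, before the loop
    while message:
        href, new_list, for_playlist = decrementList(message)
        message = new_list
        playlist.append(for_playlist)
    print playlist  # printed once, after the loop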

Turning a python dict. to an excel sheet

I am having an issue with the below code.
import urllib2
import csv
from bs4 import BeautifulSoup

soup = BeautifulSoup(urllib2.urlopen('http://www.ny.com/clubs/nightclubs/index.html').read())

clubs = []
trains = ["A","C","E","1","2","3","4","5","6","7","N","Q","R","L","B","D","F"]

for club in soup.find_all("dt"):
    clubD = {}
    clubD["name"] = club.b.get_text()
    clubD["address"] = club.i.get_text()
    text = club.dd.get_text()
    nIndex = text.find("(")
    if(text[nIndex+1]=="2"):
        clubD["number"] = text[nIndex:nIndex+15]
    sIndex = text.find("Subway")
    sIndexEnd = text.find(".",sIndex)
    if(text[sIndexEnd-1] == "W" or text[sIndexEnd -1] == "E"):
        sIndexEnd2 = text.find(".",sIndexEnd+1)
        clubD["Subway"] = text[sIndex:sIndexEnd2]
    else:
        clubD["Subway"] = text[sIndex:sIndexEnd]
    try:
        cool = clubD["number"]
    except (ValueError,KeyError):
        clubD["number"] = "N/A"
    clubs.append(clubD)

keys = [u"name", u"address", u"number", u"Subway"]
f = open('club.csv', 'wb')
dict_writer = csv.DictWriter(f, keys)
dict_writer.writerow([unicode(s).encode("utf-8") for s in clubs])
I get the error ValueError: dict contains fields not in fieldnames. I don't understand how this could be. Any assistance would be great. I am trying to turn the dictionaries into an excel file.
clubs is a list of dictionaries, where each dictionary has four fields: name, address, number, and Subway. You will need to encode each of the fields:
# Instead of:
#dict_writer.writerow([unicode(s).encode("utf-8") for s in clubs])

# Do this:
for c in clubs:
    # Encode each field: name, address, ...
    for k in c.keys():
        c[k] = c[k].encode('utf-8').strip()
    # Write to file
    dict_writer.writerow(c)
Update
I looked at your data and some of the fields have a trailing newline \n, so I updated the code to encode and strip whitespace at the same time.
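As a follow-up, csv.DictWriter can also emit the header row before the data rows via writeheader() (available from Python 2.7); a sketch using the question's variables:

f = open('club.csv', 'wb')
dict_writer = csv.DictWriter(f, keys)
dict_writer.writeheader()  # writes name, address, number, Subway as the first row
for c in clubs:
    for k in c.keys():
        c[k] = c[k].encode('utf-8').strip()
    dict_writer.writerow(c)
f.close()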
