I am not getting the expected results from the following iteration. The issue is the iteration value is not updating in the browser.find_element_by_xpath function (4th line of the code below). It always yields the first result. Note that I get the expected results when I manually run the iteration by assigning sequentially values 1,2,3,4 to the variable review_num instead of using the argument + str(review_num)
review_num = 0
for review_num in range(216): # for every review
review_num = review_num + 1
fixedelement = browser.find_element_by_xpath('//*[#id="collapseReviews"]/div/div[2]/div[2]/div[1]/div["+ str(review_num)"]')
#fixedelement = browser.find_element_by_xpath('//*[#id="collapseReviews"]/div/div[2]/div[2]/div[1]/div[1]')
#fixedelement = browser.find_element_by_xpath('//*[#id="collapseReviews"]/div/div[2]/div[2]/div[1]/div[2]')
#fixedelement = browser.find_element_by_xpath('//*[#id="collapseReviews"]/div/div[2]/div[2]/div[1]/div[3]')
#fixedelement = browser.find_element_by_xpath('//*[#id="collapseReviews"]/div/div[2]/div[2]/div[1]/div[4]')
R_title = fixedelement.find_element_by_xpath('./h4')
R_author = fixedelement.find_element_by_xpath('./div[2]/p[1]/span')
R_stars = fixedelement.find_element_by_xpath('./div[1]/div[1]/div[1]/span')
R_date = fixedelement.find_element_by_xpath('./div[1]/div[1]/div[2]/small')
R_comment = fixedelement.find_element_by_xpath('./div[1]/div[3]')
R_Yesvotes = fixedelement.find_element_by_xpath('./div[2]/div/div[1]/a[1]/span')
R_Novotes = fixedelement.find_element_by_xpath('./div[2]/div/div[1]/a[2]/span')
R_title_text = R_title.text
R_author_text = R_author.text
R_stars_text = R_stars.text
R_date_text = R_date.text
R_comment_text = R_comment.text
R_Yesvotes_text = R_Yesvotes.text
R_Novotes_text = R_Novotes.text
print(R_author_text)
with open(csvfile, "a", newline='', encoding='utf-8') as output:
writer = csv.writer(output, dialect='excel')
# writer.writerow(["namerow_id", "Name", "Position_Location"])
writer.writerow([review_num, R_title_text, R_author_text, R_stars_text, R_date_text, R_comment_text, R_Yesvotes_text, R_Novotes_text])
In the line
fixedelement = browser.find_element_by_xpath('//*[#id="collapseReviews"]/div/div[2]/div[2]/div[1]/div["+ str(review_num)"]')
you're searching for the element with xpath equal to
'//*[#id="collapseReviews"]/div/div[2]/div[2]/div[1]/div["+ str(review_num)"]'
with the "str(review_num)" hardcoded into the xpath.
You want to do something like
xpath = '//*[#id="collapseReviews"]/div/div[2]/div[2]/div[1]/div[{}]'.format(review_num)
fixedelement = browser.find_element_by_xpath(xpath)
Related
I use pylucence 9.4.1 to index a document and I just noticed a weird problem. There are some words, e.g. 'baby', that are present in the document but pylucene is unable to find them in the index.
This is my code to index the document:
(The document can be downloaded from here.
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'
def indexDocument(title, year, plot):
ft = FieldType()
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
doc = document.Document()
doc.add(document.Field("Title", title, ft))
doc.add(document.Field("Plot", plot, ft))
writer.addDocument(doc)
def CloseWriter():
writer.close()
def makeInvertedIndex(file_path):
df = pd.read_csv(file_path)
print(df.columns)
docid = 0
for i in df.index:
print(docid, '-', df['Title'][i])
indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
docid += 1
indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
keyword = 'baby'
fieldname = 'Title'
result = list()
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2,0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))
Any help to understand the problem is appreciated.
So this is what I have. What I get at the end I have a list of every element in the formattedList the same. But when I print features[i] it prints the right features that I want to add to the end of the list.
# function to change modified json list back to dict so it can be saved as a json file again
def makeDict(jsonlist):
features = [{}] * 364
geoid1 = [{}] * 364
geometry = [{}] * 364
properties = [{}] * 364
formattedList = []
tempList = []
keylistfeatures = ['geometry', 'properties']
keylistgeometry = ['coordinates', 'type']
keylistproperties = ['content', 'datatype', 'density', 'density', 'description', 'display', 'file', 'size', 'source'
, 'target']
keylistfile = ['date', 'name', 'size']
file = {}
size = []
density = []
for i in range(0, 364):
# make density list
density = [jsonlist[i][6], jsonlist[i][7]]
# size list
size = [jsonlist[i][8], jsonlist[i][9]]
# file dictionary
file['date'] = jsonlist[i][1]
file['name'] = jsonlist[i][0]
file['size'] = jsonlist[i][2]
# geometry dictionary
geometry[i]['coordinates'] = jsonlist[i][11]
geometry[i]['type'] = jsonlist[i][12]
# properties dictionary
properties[i]['content'] = jsonlist[i][5]
properties[i]['datatype'] = jsonlist[i][4]
properties[i]['density'] = density
properties[i]['description'] = jsonlist[i][10]
properties[i]['display'] = jsonlist[i][3]
properties[i]['file'] = file
properties[i]['size'] = size
properties[i]['source'] = jsonlist[i][14]
properties[i]['target'] = jsonlist[i][15]
features[i]['geometry'] = geometry[i]
features[i]['properties'] = properties[i]
features[i]['type'] = jsonlist[i][13]
# print(features[i])
formattedList.append(features[i])
# print(formattedList)
return formattedList
Been stuck on this for hours but my guess is it's something simple I don't understand.
When I use the debugger, here is what I see
1st iteration
2nd iteration
3rd iteration
4th iteration
And also features is always a list of 364 of the same thing for that iteration
features
So for my bot, I am first extracting data via api and storing it in csv. When I run my for loop on data via api, it gives no error and runs smoothly.
But when the csv file is read and run, it gives out of bound error.
This is my function to generate data:
full_list = pd.DataFrame(columns=("date","open","high","low","close","volume","ticker","RSI","ADX","20_sma","max_100"))
def stock_data(ticker):
create_data = fetchOHLC(ticker,'minute',60)
create_data["ticker"] = ticker
create_data["RSI"] = round(rsi(create_data,25),2)
create_data["ADX"] = round(adx(create_data,14),2)
create_data["20_sma"] = round(create_data.close.rolling(10).mean().shift(),2)
create_data["max_100"] = create_data.close.rolling(100).max().shift()
create_data.dropna(inplace=True,axis=0)
create_data.reset_index(inplace=True)
return create_data
stocklist = open("stocklist.txt","r+")
tickers = stocklist.readlines()
for x in tickers:
try:
full_list = full_list.append(stock_data(x.strip()))
except:
print(f'{x.strip()} did not work')
full_list.to_csv("All_Data")
full_list
So when I run the same code below on dataframe created I got no error. But when I run the same code on the csv file, I get out of bound error.
list_tickers = full_list["ticker"].unique()
for y in list_tickers[:2]:
main = full_list[full_list["ticker"]==y]
pos = 0
num = 0
tick = y
signal_time = 0
signal_rsi = 0
signal_adx = 0
buy_time = 0
buy_price = 0
sl = 0
#to add trailing sl in this.
for x in main.index:
maxx = main.iloc[x]["max_100"]
rsi = main.iloc[x]["RSI"]
adx = main.iloc[x]["ADX"]
sma = main.iloc[x]["20_sma"]
close = main.iloc[x]["close"]
high = main.iloc[x]["high"]
if rsi > 80 and adx > 35 and close > maxx:
if pos == 0:
buy_price = main.iloc[x+1]["open"]
buy_time = main.iloc[x+1]["date"]
pos=1
signal_time = main.iloc[x]["date"]
signal_rsi = main.iloc[x]["RSI"]
signal_adx = main.iloc[x]["ADX"]
elif close < sma:
if pos == 1:
sell_time = main.iloc[x]["date"]
sell_price = sma*.998
pos=0
positions.loc[positions.shape[0]] = [y,signal_time,signal_rsi,signal_adx,buy_time,buy_price,sell_time,sell_price]
Any idea why?
Here is a cleanup and file call code:
full_list = pd.read_csv("All_data")
full_list.dropna(inplace=True,axis=0)
full_list.drop(labels="Unnamed: 0",axis=1) < index of previous dataframe
full_list.head(5)
Thanks
I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linktext fileand convert it to a csv file. Right now I am only half successful. Namely, I can locate the table and parse it but somehow I cannot convert the text table to a standard one. My code is attached. Can someone help me with it?
url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'
# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
table_title = 'PRINCIPAL STOCKHOLDERS'
END_TABLE_LINE = '- ------------------------'
def find_no_line_start_table(table_title,splited_data):
found_no_lines = []
for index, line in enumerate(splited_data):
if table_title in line:
found_no_lines.append(index)
return found_no_lines
table_start = find_no_line_start_table(table_title,splited_data)
# I need help with locating the table. If I locate the table use the above function, it will return two locations and I have to manually choose the correct one.
table_start = table_start[1]
def get_start_data_table(table_start, splited_data):
for index, row in enumerate(splited_data[table_start:]):
if '<C>' in row:
return table_start + index
def get_end_table(start_table_data, splited_data ):
for index, row in enumerate(splited_data[start_table_data:]):
if END_TABLE_LINE in row:
return start_table_data + index
def row(l):
l = l.split()
number_columns = 8
if len(l) >= number_columns:
data_row = [''] * number_columns
first_column_done = False
index = 0
for w in l:
if not first_column_done:
data_row[0] = ' '.join([data_row[0], w])
if ':' in w:
first_column_done = True
else:
index += 1
data_row[index] = w
return data_row
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line : end_line]
# I also need help with convert the text table to a CSV file, somehow the following function does not #recognize my column.
def take_table(table):
owner = []
Num_share = []
middle = []
middle_1 = []
middle_2 = []
middle_3 = []
prior_offering = []
after_offering = []
for r in table:
data_row = row(r)
if data_row:
col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8 = data_row
owner.append(col_1)
Num_share.append(col_2)
middle.append(col_3)
middle_1.append(col_4)
middle_2.append(col_5)
middle_3.append(col_6)
prior_offering.append(col_7)
after_offering.append(col_8)
table_data = {'owner': owner, 'Num_share': Num_share, 'middle': middle, 'middle_1': middle_1,
'middle_2': middle_2, 'middle_3': middle_3, 'prior_offering': prior_offering,
'after_offering': after_offering}
return table_data
#print (table)
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)
Hi I m wanting to convert the contents of a file (in this case a Landsat 7 metadata file) into a series of variables defined by the contents of the file using Python 2.7. The file contents looks like this:
GROUP = L1_METADATA_FILE
GROUP = METADATA_FILE_INFO
ORIGIN = "Image courtesy of the U.S. Geological Survey"
REQUEST_ID = "0101305309253_00043"
LANDSAT_SCENE_ID = "LE71460402010069SGS00"
FILE_DATE = 2013-06-02T11:19:59Z
STATION_ID = "SGS"
PROCESSING_SOFTWARE_VERSION = "LPGS_12.2.1"
DATA_CATEGORY = "NOMINAL"
END_GROUP = METADATA_FILE_INFO
GROUP = PRODUCT_METADATA
DATA_TYPE = "L1T"
ELEVATION_SOURCE = "GLS2000"
OUTPUT_FORMAT = "GEOTIFF"
EPHEMERIS_TYPE = "DEFINITIVE"
SPACECRAFT_ID = "LANDSAT_7"
SENSOR_ID = "ETM"
SENSOR_MODE = "BUMPER"
WRS_PATH = 146
WRS_ROW = 040
DATE_ACQUIRED = 2010-03-10
GROUP = IMAGE_ATTRIBUTES
CLOUD_COVER = 0.00
IMAGE_QUALITY = 9
SUN_AZIMUTH = 137.38394502
SUN_ELEVATION = 48.01114126
GROUND_CONTROL_POINTS_MODEL = 55
GEOMETRIC_RMSE_MODEL = 3.790
GEOMETRIC_RMSE_MODEL_Y = 2.776
GEOMETRIC_RMSE_MODEL_X = 2.580
END_GROUP = IMAGE_ATTRIBUTES
Example of interested variable items:
GROUP = MIN_MAX_RADIANCE
RADIANCE_MAXIMUM_BAND_1 = 293.700
RADIANCE_MINIMUM_BAND_1 = -6.200
RADIANCE_MAXIMUM_BAND_2 = 300.900
RADIANCE_MINIMUM_BAND_2 = -6.400
RADIANCE_MAXIMUM_BAND_3 = 234.400
RADIANCE_MINIMUM_BAND_3 = -5.000
RADIANCE_MAXIMUM_BAND_4 = 241.100
RADIANCE_MINIMUM_BAND_4 = -5.100
RADIANCE_MAXIMUM_BAND_5 = 47.570
RADIANCE_MINIMUM_BAND_5 = -1.000
RADIANCE_MAXIMUM_BAND_6_VCID_1 = 17.040
RADIANCE_MINIMUM_BAND_6_VCID_1 = 0.000
RADIANCE_MAXIMUM_BAND_6_VCID_2 = 12.650
RADIANCE_MINIMUM_BAND_6_VCID_2 = 3.200
RADIANCE_MAXIMUM_BAND_7 = 16.540
RADIANCE_MINIMUM_BAND_7 = -0.350
RADIANCE_MAXIMUM_BAND_8 = 243.100
RADIANCE_MINIMUM_BAND_8 = -4.700
END_GROUP = MIN_MAX_RADIANCE
I am open to other ideas as I don't need all entries as variables, just a selection. And I see some headers are listed more than once. i.e. GROUP is used multiple times. I need to be able to select certain variables (integer values) and use in formulas in other areas of code. ANY help would be appreciated (novice python coder).
I'm not sure exactly what you are looking for, but maybe something like this:
s = '''GROUP = L1_METADATA_FILE
GROUP = METADATA_FILE_INFO
ORIGIN = "Image courtesy of the U.S. Geological Survey"
REQUEST_ID = "0101305309253_00043"
LANDSAT_SCENE_ID = "LE71460402010069SGS00"
FILE_DATE = 2013-06-02T11:19:59Z
STATION_ID = "SGS"
PROCESSING_SOFTWARE_VERSION = "LPGS_12.2.1"
DATA_CATEGORY = "NOMINAL"
END_GROUP = METADATA_FILE_INFO
GROUP = PRODUCT_METADATA
DATA_TYPE = "L1T"
ELEVATION_SOURCE = "GLS2000"
OUTPUT_FORMAT = "GEOTIFF"
EPHEMERIS_TYPE = "DEFINITIVE"
SPACECRAFT_ID = "LANDSAT_7"
SENSOR_ID = "ETM"
SENSOR_MODE = "BUMPER"
WRS_PATH = 146
WRS_ROW = 040
DATE_ACQUIRED = 2010-03-10'''
output = {} #Dict
for line in s.split("\n"): #Iterates through every line in the string
l = line.split("=") #Seperate by "=" and put into a list
output[l[0].strip()] = l[1].strip() #First word is key, second word is value
print output #Output is a dictonary containing all key-value pairs in your metadata seperated by "="
print output["SENSOR_ID"] #Outputs "ETM"
==============
Edited:
f = open('metadata.txt', 'r') #open file for reading
def build_data(f): #build dictionary
output = {} #Dict
for line in f.readlines(): #Iterates through every line in the string
if "=" in line: #make sure line has data as wanted
l = line.split("=") #Seperate by "=" and put into a list
output[l[0].strip()] = l[1].strip() #First word is key, second word is value
return output #Returns a dictionary with the key, value pairs.
data = build_data(f)
print data["IMAGE_QUALITY"] #prints 9