PySpark Luigi multiple workers issue - python

I want to load multiple files into Spark DataFrames in parallel using a Luigi workflow and store them in a dictionary.
Once all the files are loaded, I want to be able to access these DataFrames from the dictionary in the main task and do further processing. This works when I run Luigi with one worker; when running Luigi with more than one worker, the dictionary is empty in the main task.
Any suggestion would be helpful.
import luigi
from luigi import LocalTarget
from pyspark import SQLContext
from src.etl.SparkAbstract import SparkAbstract
from src.util.getSpark import get_spark_session
from src.util import getSpark, read_json
import configparser as cp
import datetime
from src.input.InputCSVFileComponent import InputCSVFile
import os
from src.etl.Component import ComponentInfo


class fileloadTask(luigi.Task):
    compinfo = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget("src/workflow_output/" + str(datetime.date.today().isoformat()) + "-" + str(self.compinfo.id) + ".csv")

    def run(self):
        a = InputCSVFile(self.compinfo)  ## this class is responsible for returning the Spark dataframe object and putting it in the dictionary
        a.execute()
        with self.output().open('w') as f:
            f.write("done")


class EnqueueTask(luigi.WrapperTask):
    compinfo = read_json.read_json_config('path to json file')

    def requires(self):
        folders = [
            comp.id for comp in list(self.compinfo) if comp.component_type == 'INPUTFILE'
        ]
        print(folders)
        newcominfo = []
        for index, objid in enumerate(folders):
            newcominfo.append(self.compinfo[index])
        for i in newcominfo:
            print(f" in compinfo..{i.id}")
        callmethod = [fileloadTask(compinfo) for compinfo in newcominfo]
        print(callmethod)
        return callmethod


class MainTask(luigi.WrapperTask):
    def requires(self):
        return EnqueueTask()

    def output(self):
        return luigi.LocalTarget("src/workflow_output/" + str(datetime.date.today().isoformat()) + "-" + "maintask" + ".csv")

    def run(self):
        print(f"printing mapdf..{SparkAbstract.mapDf}")
        res = not SparkAbstract.mapDf
        print("Is dictionary empty ? : " + str(res))  ####-------------> this is empty when workers > 1 ################
        for key, value in SparkAbstract.mapDf.items():
            print("printing from dict")
            print(key, value.show(10))
        with self.output().open('w') as f:
            f.write("done")


"""
entry point for spark application
"""
if __name__ == "__main__":
    luigi.build([MainTask()], workers=2, local_scheduler=True)

Each worker runs in its own process. That means workers can't share Python objects (in this instance, the dictionary in which you put the results).
Generally speaking, Luigi is best for orchestrating tasks with side effects (like writing to files, etc.); one way to apply that pattern here is sketched below.
If you are trying to parallelise tasks that load data in memory, I'd recommend using dask instead of Luigi.
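For example, instead of relying on a shared in-memory dictionary, each load task could persist its DataFrame to disk (e.g. as Parquet) and the downstream task could read those files back. A minimal sketch of that idea, assuming a get_spark_session() helper like the one imported in the question (the paths, parameters, and task names here are illustrative, not the asker's actual API):

import datetime
import luigi
from src.util.getSpark import get_spark_session  # assumed helper from the question


class FileLoadTask(luigi.Task):
    """Loads one CSV and writes it out as Parquet (a side effect visible to other workers)."""
    input_path = luigi.Parameter()
    comp_id = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget(f"src/workflow_output/{self.comp_id}.parquet")

    def run(self):
        spark = get_spark_session()
        df = spark.read.csv(self.input_path, header=True, inferSchema=True)
        df.write.mode("overwrite").parquet(self.output().path)


class MainTask(luigi.Task):
    comp_ids = luigi.ListParameter()

    def requires(self):
        # One load task per component; each may run in a different worker process.
        return {cid: FileLoadTask(input_path=f"data/{cid}.csv", comp_id=cid)
                for cid in self.comp_ids}

    def output(self):
        return luigi.LocalTarget(
            f"src/workflow_output/{datetime.date.today().isoformat()}-maintask.csv")

    def run(self):
        spark = get_spark_session()
        # Rebuild the dictionary of DataFrames from the files the upstream tasks wrote.
        map_df = {cid: spark.read.parquet(target.path)
                  for cid, target in self.input().items()}
        for cid, df in map_df.items():
            print(cid, df.count())
        with self.output().open('w') as f:
            f.write("done")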

Related

How to execute custom Splittable DoFn in parallel

I am trying to develop a custom I/O connector for Apache Beam, written in Python. According to the official guideline, Splittable DoFn (SDF) is the framework of choice in my case.
I tried to run the pseudocode in the SDF programming guide; however, I failed to execute the pipeline in parallel. Below is a working example.
Dummy data
myfile = open('test_beam.txt', 'w')
for i in range(0, 1000):
    myfile.write("%s\n" % i)
myfile.close()
Pipeline
Make sure to replace DUMMY_FILE with the absolute path of test_beam.txt.
import argparse
import logging
import os
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from time import sleep
import random
from apache_beam.io.restriction_trackers import OffsetRange

DUMMY_FILE = absolute_path_to_dummy_data_file


class FileToWordsRestrictionProvider(beam.transforms.core.RestrictionProvider):
    def initial_restriction(self, file_name):
        return OffsetRange(0, os.stat(file_name).st_size)

    def create_tracker(self, restriction):
        return beam.io.restriction_trackers.OffsetRestrictionTracker(
            offset_range=self.initial_restriction(file_name=DUMMY_FILE))

    def restriction_size(self, element, restriction):
        return restriction.size()


class FileToWordsFn(beam.DoFn):
    def process(
            self,
            file_name,
            # Alternatively, we can let FileToWordsFn itself inherit from
            # RestrictionProvider, implement the required methods and let
            # tracker=beam.DoFn.RestrictionParam() which will use self as
            # the provider.
            tracker=beam.DoFn.RestrictionParam(FileToWordsRestrictionProvider())):
        with open(file_name) as file_handle:
            file_handle.seek(tracker.current_restriction().start)
            while tracker.try_claim(file_handle.tell()):
                yield read_next_record(file_handle=file_handle)


def read_next_record(file_handle):
    line_number = file_handle.readline()
    logging.info(line_number)
    sleep(random.randint(1, 5))
    logging.info(f'iam done {line_number}')


def run(args, pipeline_args, file_name):
    pipeline_options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=pipeline_options) as p:
        execute_pipeline(args, p, file_name)


def execute_pipeline(args, p, file_name):
    _ = (
        p |
        'Create' >> beam.Create([file_name]) |
        'Read File' >> beam.ParDo(FileToWordsFn(file_name=file_name))
    )


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    # to be added later
    args, pipeline_args = parser.parse_known_args()
    file_name = DUMMY_FILE
    run(args, pipeline_args, file_name)
The SDF is taken from the first example here; however, I had to fix a few things (e.g., define restriction_size, and a minor misplacement of ()). Furthermore, I introduced a random sleep in read_next_record to check whether the pipeline is executed in parallel (which apparently it is not).
Is there perhaps a mistake in the way I constructed the pipeline? I would expect to use my SDF as the very first step in the pipeline, but doing so results in AttributeError: 'PBegin' object has no attribute 'windowing'. To circumvent this issue, I followed this post and created a PCollection containing the input file_name.
What is the correct way to execute an SDF within a pipeline in parallel?
Beam DoFns (including SplittableDoFns) operate on an input PCollection. For SplittableDoFn, the input is usually a PCollection of source configs (for example, input files). When executing a SplittableDoFn the Beam runner is able to parallelize the execution of even a single input element by isolating parts of the input read using the RestrictionTracker. So for a file, this would mean that you might have workers running in parallel that read data from the same file but at different offsets.
So your implementation seems correct and should already facilitate parallel execution for a Beam runner.
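One practical note to add here (my own observation, not part of the original answer): when testing locally, Beam's DirectRunner processes a single bundle at a time by default, so you may not observe parallelism even for a correct SDF. You can ask the DirectRunner to use several local workers through its pipeline options, for example:

from apache_beam.options.pipeline_options import PipelineOptions

# Local-testing options only; a distributed runner such as Dataflow decides
# on its own how to split and schedule the restrictions.
pipeline_options = PipelineOptions([
    '--direct_num_workers=4',
    '--direct_running_mode=multi_processing',
])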
Apache Beam's Splittable DoFns let you provide a custom configuration for runner-initiated splits. In my case I had to process a big file whose content had no separators and sat on a single line, and Dataflow did not scale. I used beam.transforms.core.RestrictionProvider with the split function, where I specified the number of parts in which the file should be read; with this configuration, when I ran the job, Dataflow used several workers and the processing time dropped a lot.
class FileToLinesRestrictionProvider(beam.transforms.core.RestrictionProvider):
    def initial_restriction(self, file_name):
        return OffsetRange(0, size_file)  # 6996999736 #43493
        # return OffsetRange(0, os.stat(file_name).st_size)

    def create_tracker(self, restriction):
        # return beam.io.restriction_trackers.OffsetRestrictionTracker(
        #     offset_range=self.initial_restriction(file_name=rutaFile_Test))
        return beam.io.restriction_trackers.OffsetRestrictionTracker(restriction)

    def split(self, file_name, restriction):
        # Configuration to read the file in parts
        bundle_ranges = calcular_segmentos_lectura(tamFila, tam_segmentos, size_file)
        for start, stop in bundle_ranges:
            yield OffsetRange(start, stop)

    def restriction_size(self, element, restriction):
        # print(restriction.size())
        return restriction.size()


class FileToLinesFn(beam.DoFn):
    def process(
            self,
            file_name,
            # Alternatively, we can let FileToWordsFn itself inherit from
            # RestrictionProvider, implement the required methods and let
            # tracker=beam.DoFn.RestrictionParam() which will use self as
            # the provider.
            tracker=beam.DoFn.RestrictionParam(FileToLinesRestrictionProvider())):
        with FileSystems.open(file_name) as file_handle:
            file_handle.seek(tracker.current_restriction().start)
            print(tracker.current_restriction())
            while tracker.try_claim(file_handle.tell()):
                # print(file_handle.tell())
                yield file_handle.read(tamFila)


def calcular_segmentos_lectura(
        size_line,
        tam_segmentos,
        tam_file):
    """Based on the file size and line size, divides the file into parts according
    to the input parameters.
    Returns an array with the offsets to process in each step.
    """
    num_lineas = int(tam_file / size_line)
    valor_segmento = int(num_lineas / tam_segmentos)
    valor_segmento = valor_segmento * size_line
    print(valor_segmento)
    segmentos_ranges = []
    valorAnterior = 0
    for i in range(tam_segmentos):
        start = valorAnterior
        stop_position = valorAnterior + valor_segmento
        valorAnterior = stop_position
        if (i + 1) == tam_segmentos:
            stop_position = tam_file
        segmentos_ranges.append((start, stop_position))
    return segmentos_ranges
This example helped me a lot: url

Writing a dataset to multiple directories with modin and Ray pauses unexplainably

Problem
I am trying to perform I/O operations on multiple directories using Ray, modin (with the Ray backend), and Python. The file writes pause, memory and disk usage do not change at all, and the program is blocked.
Setup
I have a ray actor set up as this
import os
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
import ray
import modin.pandas as mpd
from numpy.core import numeric
from tqdm import tqdm


@ray.remote
class DatasetHelper:
    # Class Variables (static) are to be written here

    @ray.method(num_returns=1)
    def get_dataset(self):
        return self.dataset

    @ray.method(num_returns=1)
    def generate_dataset(self):
        # generates some dataset and returns a dictionary.
        return {'status': 1,
                'data_dir': self.data_dir}

    @ray.method(num_returns=1)
    def get_config(self):
        return {
            "data_dir": self.data_dir,
            "data_map_dir": self.data_map_dir,
            "out_dir": self.out_dir
        }

    def _validate_initialization(self):
        # Logic here isn't relevant
        if self.data_dir == "" or self.data_map == "" or self.nRows == 42:
            return False
        return True

    def __init__(self, data_dir, data_map_dir, nRows, out_dir):
        self.data = {}
        self.data_map = {}
        self.dataset = mpd.DataFrame()
        self.timestamp = []
        self.first = True
        self.out_dir = out_dir
        self.data_dir = data_dir
        self.data_map_dir = data_map_dir
        self.nRows = nRows

    def _extract_data(self):
        print('Reading data ...')
        for each in os.listdir(self.data_dir):
            self.data[each.split('.')[0]] = mpd.read_csv(os.path.join(self.data_dir, each),
                                                         header=None,
                                                         nrows=self.nRows)
        print('Data read successfully ...')
        print('Validating times for monotonicity and uniqueness ... ')
        for each in tqdm(self.data):
            if mpd.to_datetime(self.data[each][0]).is_monotonic and mpd.to_datetime(self.data[each][0]).is_unique:
                pass
            else:
                print('Validation failed for uuid: {}'.format(each))
                return

    def _extract_data_maps(self):
        self.data_map = mpd.read_pickle(self.data_map_dir)
        print('Data-Map unpickled successfully ...')
The main logic is structured as shown below,
from functools import cached_property
import os
import threading
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from DatasetHelper import DatasetHelper
import gc
import json
import logging
from multiprocessing import Process
import asyncio
import ray

ray.init(
    # Limiting the object memory store used by ray.put()
    # object_store_memory=20000000000,
    # Limiting the memory usage of each worker.
    # _memory = (1024.0 * 3) * 0.5,
    # Specifying custom directories for temp and object spilling
    _temp_dir=os.path.join("/project/bhavaraj/Anaheim/ray_tmp"),
    _system_config={
        "object_spilling_config": json.dumps(
            {"type": "filesystem", "params": {
                "directory_path": "/project/bhavaraj/Anaheim/ray_plasma"}},
        )
    },
    logging_level=logging.DEBUG,
    ignore_reinit_error=True,
    num_gpus=1,
    num_cpus=40,
    dashboard_port=8265
)

write_lock = threading.Lock()


def cache_dataset(loc):
    from datetime import datetime
    params = ray.get(loc.get_config.remote())
    params['out_dir'] = os.getcwd() if params['out_dir'] is None else params['out_dir']
    if os.path.exists(params['out_dir']) is False:
        os.mkdir(params['out_dir'])
    dataset_name = datetime.now().strftime("%H:%M:%S") + \
        "_{}_Cache.csv".format(id(params['out_dir']))
    print("Writing to file in {}".format(params['out_dir']))
    print("Acquiring Lock")
    with write_lock:
        print("Lock acquired ...")
        ray.get(loc.get_dataset.remote()).to_csv(os.path.join(params['out_dir'], dataset_name))
        print("Writing to file finished at {}".format(params['out_dir']))


R_DATA_DIR: str = '/data/intermediate/R/'
R_DATA_MAP: str = '/data/external/DataMap/R.pkl'
G_DATA_DIR: str = '/data/intermediate/G/'
G_DATA_MAP: str = 'data/external/DataMap/G.pkl'
B_DATA_DIR: str = '/data/intermediate/B/'
B_DATA_MAP: str = '/data/external/DataMap/B.pkl'
C_DATA_DIR: str = '/data/intermediate/C/'
C_DATA_MAP: str = '/data/external/DataMap/C.pkl'
Z_DATA_DIR: str = '/data/intermediate/Z/'
Z_DATA_MAP: str = '/data/external/DataMap/Z.pkl'

objs_refs = []
n = 50000

b = DatasetHelper.remote(B_DATA_DIR, B_DATA_MAP, n, "./CB")
r = DatasetHelper.remote(R_DATA_DIR, R_DATA_MAP, n, "./LR")
c = DatasetHelper.remote(C_DATA_DIR, C_DATA_MAP, n, "./CC")
g = DatasetHelper.remote(G_DATA_DIR, G_DATA_MAP, n, "./AG")

objs_refs.append(b.generate_dataset.remote())
objs_refs.append(r.generate_dataset.remote())
objs_refs.append(c.generate_dataset.remote())
objs_refs.append(r.generate_dataset.remote())
objs_refs.append(g.generate_dataset.remote())

generate_outs = ray.get([x for x in objs_refs])

print("Printing dataset generation results...")
for each in generate_outs:
    print(each)

# I also tried placing these methods inside the actor but the same issue persists
cache_dataset(b)
cache_dataset(r)
cache_dataset(c)
cache_dataset(g)
I tried decorating the cache_dataset() method with @ray.remote and calling the method as below:
locs = [b, r, c, g]
ray.get([cache_dataset.remote(each) for each in locs])
Output
There are no errors with the file writes, but the program pauses execution.
2021-09-20 08:32:53,024 DEBUG node.py:890 -- Process STDOUT and STDERR is being redirected to /project/bhavaraj/Anaheim/ray_tmp/session_2021-09-20_08-32-53_008570_36561/logs.
2021-09-20 08:32:53,172 DEBUG services.py:652 -- Waiting for redis server at 127.0.0.1:6379 to respond...
2021-09-20 08:32:53,334 DEBUG services.py:652 -- Waiting for redis server at 127.0.0.1:44291 to respond...
2021-09-20 08:32:53,340 DEBUG services.py:1043 -- Starting Redis shard with 10.0 GB max memory.
2021-09-20 08:33:01,212 INFO services.py:1263 -- View the Ray dashboard at http://127.0.0.1:8265
2021-09-20 08:33:01,216 DEBUG node.py:911 -- Process STDOUT and STDERR is being redirected to /project/bhavaraj/Anaheim/ray_tmp/session_2021-09-20_08-32-53_008570_36561/logs.
2021-09-20 08:33:01,221 DEBUG services.py:1788 -- Determine to start the Plasma object store with 76.48 GB memory using /dev/shm.
2021-09-20 08:33:01,314 DEBUG services.py:652 -- Waiting for redis server at 10.2.1.35:6379 to respond...
(pid=36906) Dataset shape: (100340, 41)
(pid=36913) Dataset shape: (150692, 40)
(pid=36902) Dataset shape: (103949, 41)
(pid=36910) Dataset shape: (420269, 41)
Printing dataset generation results... # prints the results correctly
Writing to file in ./CB
Acquiring Lock
Lock acquired ...
Writing to file finished at ./CB
Writing to file in ./LR
Acquiring Lock
Lock acquired ...
2021-09-20 08:43:02,612 DEBUG (unknown file):0 -- gc.collect() freed 115 refs in 0.23721289704553783 seconds
Hypothesis
I am thinking that the ray engine is stopping before all of the tasks have finished execution. I do not know how to prove or validate this hypothesis.
I also know that ray.get is supposed to block execution till all the tasks have finished executing.
There is a deadlock-like situation somewhere.
References
https://docs.ray.io/en/latest/actors.html
https://towardsdatascience.com/writing-your-first-distributed-python-application-with-ray-4248ebc07f41
For any future readers,
modin.DataFrame.to_csv() pauses unexplainably for unknown reasons, but modin.DataFrame.to_pickle() doesn't with the same logic.
There is also a significant performance increase in read/write times when the data is stored as .pkl files.
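A minimal sketch of that workaround, adapted from the question's cache_dataset() (the file naming and helper signature here are illustrative, not from the original answer):

import os
import ray


def cache_dataset_pickle(loc, write_lock):
    """Same flow as the question's cache_dataset, but persisting with to_pickle()."""
    params = ray.get(loc.get_config.remote())
    out_dir = params['out_dir'] or os.getcwd()
    os.makedirs(out_dir, exist_ok=True)
    dataset = ray.get(loc.get_dataset.remote())
    with write_lock:
        # modin.pandas.DataFrame.to_pickle mirrors the pandas API
        dataset.to_pickle(os.path.join(out_dir, "dataset_cache.pkl"))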

How do I get the same queue into different multiprocessing files?

I see a lot of tutorials on how to use queues, but they always show them implemented in the same file. I'm trying to organize my code files well from the beginning because I anticipate the project to become very large. How do I get the queue that I initialize in my main file to import into the other function files?
Here is my main file:
import multiprocessing
import queue
from data_handler import data_handler
from get_info import get_memory_info
from get_info import get_cpu_info

if __name__ == '__main__':
    q = queue.Queue()
    getDataHandlerProcess = multiprocessing.Process(target=data_handler(q))
    getMemoryInfoProcess = multiprocessing.Process(target=get_memory_info(q))
    getCPUInfoProcess = multiprocessing.Process(target=get_cpu_info(q))
    getDataHandlerProcess.start()
    getMemoryInfoProcess.start()
    getCPUInfoProcess.start()
    print("DEBUG: All tasks successfully started.")
Here is my producer:
import psutil
import struct
import time
from data_frame import build_frame


def get_cpu_info(q):
    while True:
        cpu_string_data = bytes('', 'utf-8')
        cpu_times = psutil.cpu_percent(interval=0.0, percpu=True)
        for item in cpu_times:
            cpu_string_data = cpu_string_data + struct.pack('<d', item)
        cpu_frame = build_frame(cpu_string_data, 0, 0, -1, -1)
        q.put(cpu_frame)
        print(cpu_frame)
        time.sleep(1.000)


def get_memory_info(q):
    while True:
        memory_string_data = bytes('', 'utf-8')
        virtual_memory = psutil.virtual_memory()
        swap_memory = psutil.swap_memory()
        memory_info = list(virtual_memory + swap_memory)
        for item in memory_info:
            memory_string_data = memory_string_data + struct.pack('<d', item)
        memory_frame = build_frame(memory_string_data, 0, 1, -1, -1)
        q.put(memory_frame)
        print(memory_frame)
        time.sleep(1.000)


def get_disk_info(q):
    while True:
        disk_usage = psutil.disk_usage("/")
        disk_io_counters = psutil.disk_io_counters()
        time.sleep(1.000)
        print(disk_usage)
        print(disk_io_counters)


def get_network_info(q):
    while True:
        net_io_counters = psutil.net_io_counters()
        time.sleep(1.000)
        print(net_io_counters)
And here is my consumer:
def data_handler(q):
    while True:
        next_element = q.get()
        print(next_element)
        print('Item received at data handler queue.')
It is not entirely clear to me what you mean by "How do I get the queue that I initialize in my main file to import into the other function files?".
Normally you pass a queue as an argument to a function and use it within the function scope, regardless of the file structure, or use any other variable-sharing technique you would use for any other data type.
Your code seems to have a few errors, however. Firstly, you shouldn't be using queue.Queue with multiprocessing; multiprocessing has its own version of that class:
q = multiprocessing.Queue()
It is slower than queue.Queue, but it works for sharing data across processes.
Secondly, the proper way to create process objects is:
getDataHandlerProcess = multiprocessing.Process(target=data_handler, args = (q,))
Otherwise you are actually calling data_handler(q) in the main thread and trying to assign its return value to the target argument of multiprocessing.Process. Your data_handler function never returns, so the program probably deadlocks at this point before multiprocessing even begins. Edit: actually, it probably goes into an infinite wait trying to get an element from an empty queue that will never be filled.
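Putting both fixes together, a corrected main file might look like the sketch below (module names are taken from the question; this is untested against the asker's project):

import multiprocessing

from data_handler import data_handler
from get_info import get_memory_info
from get_info import get_cpu_info

if __name__ == '__main__':
    q = multiprocessing.Queue()  # process-safe queue instead of queue.Queue

    # Pass the target function itself plus its arguments; do not call it here.
    getDataHandlerProcess = multiprocessing.Process(target=data_handler, args=(q,))
    getMemoryInfoProcess = multiprocessing.Process(target=get_memory_info, args=(q,))
    getCPUInfoProcess = multiprocessing.Process(target=get_cpu_info, args=(q,))

    getDataHandlerProcess.start()
    getMemoryInfoProcess.start()
    getCPUInfoProcess.start()
    print("DEBUG: All tasks successfully started.")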

Question about translating different Ray workers into dask workers

I am currently working to translate Ray methods into Dask, and I wanted to know how to create five different "workers" for one class like the Ray program below does (run the program in a Jupyter notebook):
So far I have a similar method in my Dask version, but each "actor_" prints the same actor ID from the ping function. Could someone help with creating a similar product in Dask?
"""
#CURRENT VERSION
A simple test of Ray
This example uses placement_group API to spread work around
"""
import random
import os
import platform
import ray
import time
ray.init(ignore_reinit_error=True)
#ray.remote
class Actor():
def __init__(self, actor_id) -> None:
self.pid = os.getpid()
self.hostname = platform.node()
self.ip = ray._private.services.get_node_ip_address()
self.actor_id = actor_id
def ping(self):
print(f"{self.actor_id} {self.pid} {self.hostname} {self.ip} {time.time()} - ping")
time.sleep(random.randint(1,3))
return f"{self.actor_id}"
#ray.remote
def main():
# Get list of nodes to use
print(f"Found {len(actors)} Worker nodes in the Ray Cluster:")
# Setup one Actor per node
print(f"Setting up {len(actors)} Actors...")
time.sleep(1)
# Ping-Pong test
messages = [actors[a].ping.remote() for a in actors]
time.sleep(1)
for _ in range(20):
new_messages, messages = ray.wait(messages, num_returns=1)
for ray_message_id in new_messages:
pong = ray.get(ray_message_id)
print(pong, "- pong")
check = actors[pong].ping.remote()
time.sleep(1)
messages.append(check)
actors = {
"actor1" : Actor.remote(actor_id="actor1"),
"actor2" : Actor.remote(actor_id="actor2"),
"actor3" : Actor.remote(actor_id="actor3"),
"actor4" : Actor.remote(actor_id="actor4"),
"actor5" : Actor.remote(actor_id="actor5")
}
print(actors)
if __name__ == "__main__":
main.remote()
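For the Dask side (an assumption on my part, since the question doesn't show the Dask attempt), one option is Dask's actors: creating each instance with its own client.submit(..., actor=True) call gives every actor its own state and identity, rather than all names pointing at one shared object. A rough sketch:

import os
import platform
import random
import time

from dask.distributed import Client


class Actor:
    def __init__(self, actor_id):
        self.pid = os.getpid()
        self.hostname = platform.node()
        self.actor_id = actor_id

    def ping(self):
        print(f"{self.actor_id} {self.pid} {self.hostname} {time.time()} - ping")
        time.sleep(random.randint(1, 3))
        return self.actor_id


if __name__ == "__main__":
    client = Client(n_workers=5, threads_per_worker=1)

    # One actor per submit call; Dask decides which worker hosts each one.
    actors = {
        f"actor{i}": client.submit(Actor, f"actor{i}", actor=True).result()
        for i in range(1, 6)
    }

    # Calling a method on a Dask actor returns an ActorFuture; .result() blocks for the value.
    pongs = {name: actor.ping() for name, actor in actors.items()}
    for name, future in pongs.items():
        print(future.result(), "- pong")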

Pickling issue while using Pool to check the count of files

I have code that is spawning multiple processes to check the line count of files and maintain the records in a database. The working code is mentioned below:
import multiprocessing as mp
from multiprocessing import Pool
import os
import time
import mysql.connector


"""Function to check the count of the file"""
def file_wc(fname):
    with open('/home/vaibhav/Desktop/Input_python/' + fname) as f:
        count = sum(1 for line in f)
    return (fname, count)


class file_audit:
    def __init__(self):
        """Initialising the constructor for getting the names of files
        and referencing the outside-class function"""
        folder = '/home/vaibhav/Desktop/Input_python'
        self.fnames = (name for name in os.listdir(folder))
        self.file_wc = file_wc

    def count_check(self):
        """Creating 4 worker processes to check the count of the files in parallel"""
        pool = Pool(4)
        self.m = list(pool.map(self.file_wc, list(self.fnames), 4))
        pool.close()
        pool.join()

    def database_updation(self):
        """To maintain an entry in the database with details
        like filename and records present in the file"""
        self.db = mysql.connector.connect(host="localhost", user="root", password="root", database="python_showtime")
        # prepare a cursor object using cursor() method
        self.cursor = self.db.cursor()
        query_string = ("INSERT INTO python_showtime.audit_capture"
                        "(name,records)"
                        "VALUES(%s,%s)")
        # data_user = (name,records)
        for each in self.m:
            self.cursor.execute(query_string, each)
        self.db.commit()
        self.cursor.close()


start_time = time.time()
print("My program took", time.time() - start_time, "to run")

# if __name__ == '__main__':
x = file_audit()
x.count_check()        # To check the count by spawning multiple processes
x.database_updation()  # To maintain the entry in the database
Point to be considered
Now, if I put the function inside the class and comment out self.file_wc = file_wc in the constructor, I get the error "can't pickle generator objects". I have a fair understanding that we cannot pickle some objects, so I want to know what exactly is happening in the background, in very simple terms. I got the reference from here or here to make the code work.
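For what it's worth (this explanation is mine, not from the original thread): Pool.map pickles the callable it is given. When file_wc is a module-level function, only a reference to it is pickled; when it becomes a method and you call pool.map(self.file_wc, ...), the bound method carries self with it, and self holds self.fnames, a generator, which cannot be pickled. A tiny standalone sketch of the difference (names here are illustrative):

import pickle


class Demo:
    def __init__(self):
        # a generator attribute, like self.fnames in the question
        self.fnames = (n for n in ["a.txt", "b.txt"])

    def file_wc(self, fname):
        return fname


def plain_file_wc(fname):
    return fname


d = Demo()

# Pickling a bound method drags the whole instance, generator included, along with it:
try:
    pickle.dumps(d.file_wc)
except TypeError as e:
    print("bound method:", e)   # e.g. "cannot pickle 'generator' object"

# A module-level function is pickled by reference, with no instance state attached:
print("plain function pickled to", len(pickle.dumps(plain_file_wc)), "bytes")

Converting the generator to a list (or keeping file_wc at module level, as in the working version) avoids the unpicklable attribute.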
