Scientific Publications
2021 |
|
Garcia, Adriano Marques; Griebler, Dalvan; Schepke, Claudio; Fernandes, Luiz Gustavo Introducing a Stream Processing Framework for Assessing Parallel Programming Interfaces Inproceedings Forthcoming 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP), IEEE, Valladolid, Spain, Forthcoming. @inproceedings{GARCIA:PDP:21, title = {Introducing a Stream Processing Framework for Assessing Parallel Programming Interfaces}, author = {Adriano Marques Garcia and Dalvan Griebler and Claudio Schepke and Luiz Gustavo Fernandes}, year = {2021}, date = {2021-03-01}, booktitle = {29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)}, publisher = {IEEE}, address = {Valladolid, Spain}, series = {PDP'21}, abstract = {Stream Processing applications are spread across different sectors of industry and people's daily lives. The increasing data we produce, such as audio, video, image, and text are demanding quick and efficient computation. It can be done through Stream Parallelism, which is still a challenging task and mostly reserved for experts. We introduce a Stream Processing framework for assessing Parallel Programming Interfaces (PPIs). Our framework targets multi-core architectures and C++ stream processing applications, providing an API that abstracts the details of the stream operators of these applications. Therefore, users can easily identify all the basic operators and implement parallelism through different PPIs. In this paper, we present the proposed framework, implement three applications using its API, and show how it works, by using it to parallelize and evaluate the applications with the PPIs Intel TBB, FastFlow, and SPar. The performance results were consistent with the literature.}, keywords = {}, pubstate = {forthcoming}, tppubtype = {inproceedings} } Stream Processing applications are spread across different sectors of industry and people's daily lives. 
The increasing data we produce, such as audio, video, image, and text are demanding quick and efficient computation. It can be done through Stream Parallelism, which is still a challenging task and mostly reserved for experts. We introduce a Stream Processing framework for assessing Parallel Programming Interfaces (PPIs). Our framework targets multi-core architectures and C++ stream processing applications, providing an API that abstracts the details of the stream operators of these applications. Therefore, users can easily identify all the basic operators and implement parallelism through different PPIs. In this paper, we present the proposed framework, implement three applications using its API, and show how it works, by using it to parallelize and evaluate the applications with the PPIs Intel TBB, FastFlow, and SPar. The performance results were consistent with the literature. | |
Vogel, Adriano; Griebler, Dalvan; Fernandes, Luiz G Providing High‐Level Self‐Adaptive Abstractions for Stream Parallelism on Multicores Journal Article Software: Practice and Experience, na (na), pp. na, 2021. @article{VOGEL:SPE:21, title = {Providing High‐Level Self‐Adaptive Abstractions for Stream Parallelism on Multicores}, author = {Adriano Vogel and Dalvan Griebler and Luiz G Fernandes}, url = {https://doi.org/10.1002/spe.2948}, doi = {10.1002/spe.2948}, year = {2021}, date = {2021-01-01}, journal = {Software: Practice and Experience}, volume = {na}, number = {na}, pages = {na}, publisher = {Wiley Online Library}, abstract = {Stream processing applications are common computing workloads that demand parallelism to increase their performance. As in the past, parallel programming remains a difficult task for application programmers. The complexity increases when application programmers must set non-intuitive parallelism parameters, i.e. the degree of parallelism. The main problem is that state-of-the-art libraries use a static degree of parallelism and are not sufficiently abstracted for developing stream processing applications. In this paper, we propose a self-adaptive regulation of the degree of parallelism to provide higher-level abstractions. Flexibility is provided to programmers with two new self-adaptive strategies, one is for performance experts, and the other abstracts the need to set a performance goal. We evaluated our solution using compiler transformation rules to generate parallel code with the SPar domain-specific language. The experimental results with real-world applications highlighted higher abstraction levels without significant performance degradation in comparison to static executions. 
The strategy for performance experts achieved slightly higher performance than the one that works without user-defined performance goals.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Stream processing applications are common computing workloads that demand parallelism to increase their performance. As in the past, parallel programming remains a difficult task for application programmers. The complexity increases when application programmers must set non-intuitive parallelism parameters, i.e. the degree of parallelism. The main problem is that state-of-the-art libraries use a static degree of parallelism and are not sufficiently abstracted for developing stream processing applications. In this paper, we propose a self-adaptive regulation of the degree of parallelism to provide higher-level abstractions. Flexibility is provided to programmers with two new self-adaptive strategies, one is for performance experts, and the other abstracts the need to set a performance goal. We evaluated our solution using compiler transformation rules to generate parallel code with the SPar domain-specific language. The experimental results with real-world applications highlighted higher abstraction levels without significant performance degradation in comparison to static executions. The strategy for performance experts achieved slightly higher performance than the one that works without user-defined performance goals. | |
2020 |
|
Löff, Júnior Henrique Aumentando a Expressividade e Melhorando a Geração de Código Paralelo para o Paradigma de Paralelismo de Stream em Arquiteturas Multi-core Technical Report School of Technology - PPGCC - PUCRS Porto Alegre, Brazil, 2020. @techreport{LOFF:TCC:20, title = {Aumentando a Expressividade e Melhorando a Geração de Código Paralelo para o Paradigma de Paralelismo de Stream em Arquiteturas Multi-core}, author = {Júnior Henrique Löff}, year = {2020}, date = {2020-12-01}, address = {Porto Alegre, Brazil}, institution = {School of Technology - PPGCC - PUCRS}, abstract = {Multi-core processors are increasing in popularity as an alternative for the physical limitations of unceasing frequency increment in sequential processors. Stream processing applications also have seen an increasing demand with the availability of sensors, IoT devices and user data. Modern systems can generate millions of data per day, that require to be processed timely. To this end, parallelization is crucial to extract the maximum performance of modern parallel architectures. On the other hand, parallel programming is still a challenge to developers, since they must deal with low-level details and operating system knowledge such as scheduling, load balancing and synchronizations. This work goes in this direction, since we offer an additional layer of abstraction to hide these complexities from the programmer, who can entirely focus on the application business. This work contributes with SPar, a domain-specific language for expressing stream parallelism in multi-core architectures with C++11 attributes. This work has increased SPar's expressiveness by adding two new attributes to its language and improving the compiler code generation efficiency. The main modifications added were to allow data parallelism exploitation, along with the current stream parallelism already implemented in SPar. 
Experiments were conducted with real applications to evaluate the expressiveness, flexibility and performance obtained with the new version. Results show that using only data parallel patterns, SPar achieved similar or even better performance than hand-coded parallelizations. Results also show that combining stream and data parallel patterns is not always efficient. However, there is an opportunity for exploring such approach in the future, since we didn't implement any specific combined optimization, for instance, apply the best stream and data parallel degree combination (auto-adaptive). Meanwhile, traditional parallel patterns (i.e. Map, Farm and Pipeline) were improved along the years and already implement customizations such as schedulers and reductions.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Multi-core processors are increasing in popularity as an alternative for the physical limitations of unceasing frequency increment in sequential processors. Stream processing applications also have seen an increasing demand with the availability of sensors, IoT devices and user data. Modern systems can generate millions of data per day, that require to be processed timely. To this end, parallelization is crucial to extract the maximum performance of modern parallel architectures. On the other hand, parallel programming is still a challenge to developers, since they must deal with low-level details and operating system knowledge such as scheduling, load balancing and synchronizations. This work goes in this direction, since we offer an additional layer of abstraction to hide these complexities from the programmer, who can entirely focus on the application business. This work contributes with SPar, a domain-specific language for expressing stream parallelism in multi-core architectures with C++11 attributes. 
This work has increased SPar's expressiveness by adding two new attributes to its language and improving the compiler code generation efficiency. The main modifications added were to allow data parallelism exploitation, along with the current stream parallelism already implemented in SPar. Experiments were conducted with real applications to evaluate the expressiveness, flexibility and performance obtained with the new version. Results show that using only data parallel patterns, SPar achieved similar or even better performance than hand-coded parallelizations. Results also show that combining stream and data parallel patterns is not always efficient. However, there is an opportunity for exploring such approach in the future, since we didn't implement any specific combined optimization, for instance, apply the best stream and data parallel degree combination (auto-adaptive). Meanwhile, traditional parallel patterns (i.e. Map, Farm and Pipeline) were improved along the years and already implement customizations such as schedulers and reductions. | |
Hoffmann, Renato Barreto Stream Parallelism Annotations for Autonomic OpenMP Code Generation Technical Report School of Technology - PPGCC - PUCRS Porto Alegre, Brazil, 2020. @techreport{HOFFMANN:TCC:20, title = {Stream Parallelism Annotations for Autonomic OpenMP Code Generation}, author = {Renato Barreto Hoffmann}, year = {2020}, date = {2020-12-01}, address = {Porto Alegre, Brazil}, institution = {School of Technology - PPGCC - PUCRS}, abstract = {Recent computer architectures are parallel in nature and require parallel programming to fully exploit the machine power. However, this is a complex and error-prone task. Offering parallel abstractions is a way to mitigate this problem. Some examples that do it are OpenMP, Intel's TBB (Threading Building Blocks), and FastFlow. However, each interface is designed with different implementation techniques and design goals. More specifically, OpenMP is an industry and academy standard for parallel programming that is complex when used to develop parallel stream processing applications. It lacks key programming mechanisms and abstractions that must be developed or obtained from external application programming interfaces (API). On the other hand, the importance of stream processing applications can not be ignored. They are present in data encryption and compression, sensor monitoring, data analytics, video and audio filtering, etc. To tackle this problem, we proposed to use an existing high-level parallel programming model (named SPar) for generating lower-level stream processing OpenMP code. We achieved that by implementing a stream processing template with OpenMP, using its default directives and standard C++ mechanisms. After that, aiming at increasing the abstraction level, we automatically generated code for this stream processing template using SPar's set of source code annotations. 
This was based on the proposed set of transformation rules for OpenMP stream processing template, which were used for automatic source-to-source code transformations directly in the abstract syntax tree (AST). Our experiments demonstrated that in 4 different stream processing applications the maximum observed performance difference compared to the state-of-the-art implementations was: 1.72% in the case of the manual OpenMP implementation; and 2.49% in the case of the SPar's generated code.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Recent computer architectures are parallel in nature and require parallel programming to fully exploit the machine power. However, this is a complex and error-prone task. Offering parallel abstractions is a way to mitigate this problem. Some examples that do it are OpenMP, Intel's TBB (Threading Building Blocks), and FastFlow. However, each interface is designed with different implementation techniques and design goals. More specifically, OpenMP is an industry and academy standard for parallel programming that is complex when used to develop parallel stream processing applications. It lacks key programming mechanisms and abstractions that must be developed or obtained from external application programming interfaces (API). On the other hand, the importance of stream processing applications can not be ignored. They are present in data encryption and compression, sensor monitoring, data analytics, video and audio filtering, etc. To tackle this problem, we proposed to use an existing high-level parallel programming model (named SPar) for generating lower-level stream processing OpenMP code. We achieved that by implementing a stream processing template with OpenMP, using its default directives and standard C++ mechanisms. After that, aiming at increasing the abstraction level, we automatically generated code for this stream processing template using SPar's set of source code annotations. 
This was based on the proposed set of transformation rules for OpenMP stream processing template, which were used for automatic source-to-source code transformations directly in the abstract syntax tree (AST). Our experiments demonstrated that in 4 different stream processing applications the maximum observed performance difference compared to the state-of-the-art implementations was: 1.72% in the case of the manual OpenMP implementation; and 2.49% in the case of the SPar's generated code. | |
Rockenbach, Dinei André High-Level Programming Abstractions for Stream Parallelism on GPUs Masters Thesis School of Technology - PPGCC - PUCRS, 2020. @mastersthesis{ROCKENBACH:DM:20, title = {High-Level Programming Abstractions for Stream Parallelism on GPUs}, author = {Dinei André Rockenbach}, year = {2020}, date = {2020-11-01}, address = {Porto Alegre, Brazil}, school = {School of Technology - PPGCC - PUCRS}, abstract = {The growth and spread of parallel architectures have driven the pursuit of greater computing power with massively parallel hardware such as the Graphics Processing Units (GPUs). This new heterogeneous computer architecture composed of multi-core Central Processing Units (CPUs) and many-core GPUs became usual, enabling novel software applications such as self-driving cars, real-time ray tracing, deep learning, and virtual reality (VR), which are characterized as stream processing applications. However, this heterogeneous environment poses an additional challenge to software development, which is still in the process of adapting to the parallel processing paradigm on multi-core systems, where programmers are supported by several APIs (Application Programming Interfaces) that offer different abstraction levels. The parallelism exploitation in GPU is done using both CUDA and OpenCL for academia and industry, whose developers have to deal with low-level architecture concepts to efficiently exploit GPU parallelism in their applications. There is still a lack of parallel programming abstractions when: 1) parallelizing code on GPUs, and 2) needing higher-level programming abstractions that deal with both CPU and GPU parallelism. Unfortunately, developers still have to be expert programmers on system and architecture to enable efficient hardware parallelism exploitation in this architectural environment. 
To contribute to the first problem, we created gsparlib, a novel structured parallel programming library for exploiting GPU parallelism that provides a unified programming API and driver-agnostic runtime. It offers Map and Reduce parallel patterns on top of CUDA and OpenCL drivers. We evaluate its performance comparing with state-of-the-art APIs, where the experiments revealed a comparable and efficient performance. For contributing to the second problem, we extended the SPar Domain-Specific Language (DSL), which has been proved to be high-level and productive for expressing stream parallelism with C++ annotations in multi-core CPUs. In this work, we propose and implement new annotations that increase expressiveness to combine the current stream parallelism on CPUs and data parallelism on GPUs. We also provide new pattern-based transformation rules that were implemented in the compiler targeting automatic source-to-source code transformations using GSParLib for GPU parallelism exploitation. Our experiments demonstrate that SPar compiler is able to generate stream and data parallel patterns without significant performance penalty compared to handwritten code. Thanks to these advances in spar, our work is the first on providing high-level C++11 annotations as an API that does not require significant code refactoring in sequential programs while enabling multi-core CPU and many-core GPU parallelism exploitation for stream processing applications.}, keywords = {}, pubstate = {published}, tppubtype = {mastersthesis} } The growth and spread of parallel architectures have driven the pursuit of greater computing power with massively parallel hardware such as the Graphics Processing Units (GPUs). 
This new heterogeneous computer architecture composed of multi-core Central Processing Units (CPUs) and many-core GPUs became usual, enabling novel software applications such as self-driving cars, real-time ray tracing, deep learning, and virtual reality (VR), which are characterized as stream processing applications. However, this heterogeneous environment poses an additional challenge to software development, which is still in the process of adapting to the parallel processing paradigm on multi-core systems, where programmers are supported by several APIs (Application Programming Interfaces) that offer different abstraction levels. The parallelism exploitation in GPU is done using both CUDA and OpenCL for academia and industry, whose developers have to deal with low-level architecture concepts to efficiently exploit GPU parallelism in their applications. There is still a lack of parallel programming abstractions when: 1) parallelizing code on GPUs, and 2) needing higher-level programming abstractions that deal with both CPU and GPU parallelism. Unfortunately, developers still have to be expert programmers on system and architecture to enable efficient hardware parallelism exploitation in this architectural environment. To contribute to the first problem, we created gsparlib, a novel structured parallel programming library for exploiting GPU parallelism that provides a unified programming API and driver-agnostic runtime. It offers Map and Reduce parallel patterns on top of CUDA and OpenCL drivers. We evaluate its performance comparing with state-of-the-art APIs, where the experiments revealed a comparable and efficient performance. For contributing to the second problem, we extended the SPar Domain-Specific Language (DSL), which has been proved to be high-level and productive for expressing stream parallelism with C++ annotations in multi-core CPUs. 
In this work, we propose and implement new annotations that increase expressiveness to combine the current stream parallelism on CPUs and data parallelism on GPUs. We also provide new pattern-based transformation rules that were implemented in the compiler targeting automatic source-to-source code transformations using GSParLib for GPU parallelism exploitation. Our experiments demonstrate that SPar compiler is able to generate stream and data parallel patterns without significant performance penalty compared to handwritten code. Thanks to these advances in spar, our work is the first on providing high-level C++11 annotations as an API that does not require significant code refactoring in sequential programs while enabling multi-core CPU and many-core GPU parallelism exploitation for stream processing applications. | |
Pieper, Ricardo Luis High-level Programming Abstractions for Distributed Stream Processing Masters Thesis School of Technology - PPGCC - PUCRS, 2020. @mastersthesis{PIEPER:DM:20, title = {High-level Programming Abstractions for Distributed Stream Processing}, author = {Ricardo Luis Pieper}, year = {2020}, date = {2020-10-01}, address = {Porto Alegre, Brazil}, school = {School of Technology - PPGCC - PUCRS}, abstract = {Stream processing applications represent a significant part of today's software. An increased amount of streaming data is generated every day from various sources (computing devices and applications), which requires to be processed on time. Shared-memory architectures cannot cope with these large-scale processing demands. In High-Performance Computing (HPC), Message Passing Interface (MPI) is the state-of-the-art parallel API (Application Programming Interface) for implementing parallel C/C++ programs. However, the stream parallelism exploitation using MPI is difficult and error-prone to application developers because it exposes low-level details to them, regarding computer architectures and operating systems. Programmers have to deal with implementation mechanisms for data serialization, process communication and synchronization, fault tolerance, work scheduling, load balancing, and parallelism strategies. Our research work addresses a subset of these challenges and problems providing two high-level programming abstractions for distributed stream processing. First, we created a distributed stream parallelism library called dsparlib. It was built as a skeleton library equipped with Farm and Pipeline parallel patterns to provide programming abstractions on top of MPI. Second, we extend the spar language and compiler roles to support distributed memory architectures since it is a Domain-Specific Language (DSL) for expressing stream parallelism using C++11 annotation that has been proved to be productive on shared-memory architectures. 
We managed to make it work without significantly changing the easy-to-use language syntax and semantics, generating automatic parallel code with spar's compiler using dsparlib as the parallel runtime. The experiments were conducted using real-world stream processing applications and testing different cluster configurations. We demonstrated that dsparlib provides a simpler API than MPI and a competitive performance. Also, the spar's compiler was able to generate parallel code automatically without performance penalties compared to handwritten codes in dsparlib. Finally, with all these high-level programming abstractions implemented, spar becomes the first annotation-based language for expressing stream parallelism in C++ programs to support distributed-memory architectures, avoiding significant sequential code refactoring to enable parallel execution on clusters.}, keywords = {}, pubstate = {published}, tppubtype = {mastersthesis} } Stream processing applications represent a significant part of today's software. An increased amount of streaming data is generated every day from various sources (computing devices and applications), which requires to be processed on time. Shared-memory architectures cannot cope with these large-scale processing demands. In High-Performance Computing (HPC), Message Passing Interface (MPI) is the state-of-the-art parallel API (Application Programming Interface) for implementing parallel C/C++ programs. However, the stream parallelism exploitation using MPI is difficult and error-prone to application developers because it exposes low-level details to them, regarding computer architectures and operating systems. Programmers have to deal with implementation mechanisms for data serialization, process communication and synchronization, fault tolerance, work scheduling, load balancing, and parallelism strategies. 
Our research work addresses a subset of these challenges and problems providing two high-level programming abstractions for distributed stream processing. First, we created a distributed stream parallelism library called dsparlib. It was built as a skeleton library equipped with Farm and Pipeline parallel patterns to provide programming abstractions on top of MPI. Second, we extend the spar language and compiler roles to support distributed memory architectures since it is a Domain-Specific Language (DSL) for expressing stream parallelism using C++11 annotation that has been proved to be productive on shared-memory architectures. We managed to make it work without significantly changing the easy-to-use language syntax and semantics, generating automatic parallel code with spar's compiler using dsparlib as the parallel runtime. The experiments were conducted using real-world stream processing applications and testing different cluster configurations. We demonstrated that dsparlib provides a simpler API than MPI and a competitive performance. Also, the spar's compiler was able to generate parallel code automatically without performance penalties compared to handwritten codes in dsparlib. Finally, with all these high-level programming abstractions implemented, spar becomes the first annotation-based language for expressing stream parallelism in C++ programs to support distributed-memory architectures, avoiding significant sequential code refactoring to enable parallel execution on clusters. | |
Hoffmann, Renato B; Griebler, Dalvan; Danelutto, Marco; Fernandes, Luiz G Stream Parallelism Annotations for Multi-Core Frameworks Inproceedings XXIV Brazilian Symposium on Programming Languages (SBLP), pp. 48-55, ACM, Natal, Brazil, 2020. @inproceedings{HOFFMANN:SBLP:20, title = {Stream Parallelism Annotations for Multi-Core Frameworks}, author = {Renato B Hoffmann and Dalvan Griebler and Marco Danelutto and Luiz G Fernandes}, url = {https://doi.org/10.1145/3427081.3427088}, doi = {10.1145/3427081.3427088}, year = {2020}, date = {2020-10-01}, booktitle = {XXIV Brazilian Symposium on Programming Languages (SBLP)}, pages = {48-55}, publisher = {ACM}, address = {Natal, Brazil}, series = {SBLP'20}, abstract = {Data generation, collection, and processing is an important workload of modern computer architectures. Stream or high-intensity data flow applications are commonly employed in extracting and interpreting the information contained in this data. Due to the computational complexity of these applications, high-performance ought to be achieved using parallel computing. Indeed, the efficient exploitation of available parallel resources from the architecture remains a challenging task for the programmers. Techniques and methodologies are required to help shift the efforts from the complexity of parallelism exploitation to specific algorithmic solutions. To tackle this problem, we propose a methodology that provides the developer with a suitable abstraction layer between a clean and effective parallel programming interface targeting different multi-core parallel programming frameworks. We used standard C++ code annotations that may be inserted in the source code by the programmer. Then, a compiler parses C++ code with the annotations and generates calls to the desired parallel runtime API. 
Our experiments demonstrate the feasibility of our methodology and the performance of the abstraction layer, where the difference is negligible in four applications with respect to the state-of-the-art C++ parallel programming frameworks. Additionally, our methodology allows improving the application performance since the developers can choose the runtime that best performs in their system.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Data generation, collection, and processing is an important workload of modern computer architectures. Stream or high-intensity data flow applications are commonly employed in extracting and interpreting the information contained in this data. Due to the computational complexity of these applications, high-performance ought to be achieved using parallel computing. Indeed, the efficient exploitation of available parallel resources from the architecture remains a challenging task for the programmers. Techniques and methodologies are required to help shift the efforts from the complexity of parallelism exploitation to specific algorithmic solutions. To tackle this problem, we propose a methodology that provides the developer with a suitable abstraction layer between a clean and effective parallel programming interface targeting different multi-core parallel programming frameworks. We used standard C++ code annotations that may be inserted in the source code by the programmer. Then, a compiler parses C++ code with the annotations and generates calls to the desired parallel runtime API. Our experiments demonstrate the feasibility of our methodology and the performance of the abstraction layer, where the difference is negligible in four applications with respect to the state-of-the-art C++ parallel programming frameworks. Additionally, our methodology allows improving the application performance since the developers can choose the runtime that best performs in their system. | |
Stein, Charles M; Rockenbach, Dinei A; Griebler, Dalvan; Torquati, Massimo; Mencagli, Gabriele; Danelutto, Marco; Fernandes, Luiz G Latency‐aware adaptive micro‐batching techniques for streamed data compression on graphics processing units Journal Article Concurrency and Computation: Practice and Experience, na (na), pp. e5786, 2020. @article{STEIN:CCPE:20, title = {Latency‐aware adaptive micro‐batching techniques for streamed data compression on graphics processing units}, author = {Charles M Stein and Dinei A Rockenbach and Dalvan Griebler and Massimo Torquati and Gabriele Mencagli and Marco Danelutto and Luiz G Fernandes}, url = {https://doi.org/10.1002/cpe.5786}, doi = {10.1002/cpe.5786}, year = {2020}, date = {2020-05-01}, journal = {Concurrency and Computation: Practice and Experience}, volume = {na}, number = {na}, pages = {e5786}, publisher = {Wiley Online Library}, abstract = {Stream processing is a parallel paradigm used in many application domains. With the advance of graphics processing units (GPUs), their usage in stream processing applications has increased as well. The efficient utilization of GPU accelerators in streaming scenarios requires to batch input elements in microbatches, whose computation is offloaded on the GPU leveraging data parallelism within the same batch of data. Since data elements are continuously received based on the input speed, the bigger the microbatch size the higher the latency to completely buffer it and to start the processing on the device. Unfortunately, stream processing applications often have strict latency requirements that need to find the best size of the microbatches and to adapt it dynamically based on the workload conditions as well as according to the characteristics of the underlying device and network. In this work, we aim at implementing latency‐aware adaptive microbatching techniques and algorithms for streaming compression applications targeting GPUs. 
The evaluation is conducted using the Lempel‐Ziv‐Storer‐Szymanski compression application considering different input workloads. As a general result of our work, we noticed that algorithms with elastic adaptation factors respond better for stable workloads, while algorithms with narrower targets respond better for highly unbalanced workloads.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Stream processing is a parallel paradigm used in many application domains. With the advance of graphics processing units (GPUs), their usage in stream processing applications has increased as well. The efficient utilization of GPU accelerators in streaming scenarios requires to batch input elements in microbatches, whose computation is offloaded on the GPU leveraging data parallelism within the same batch of data. Since data elements are continuously received based on the input speed, the bigger the microbatch size the higher the latency to completely buffer it and to start the processing on the device. Unfortunately, stream processing applications often have strict latency requirements that need to find the best size of the microbatches and to adapt it dynamically based on the workload conditions as well as according to the characteristics of the underlying device and network. In this work, we aim at implementing latency‐aware adaptive microbatching techniques and algorithms for streaming compression applications targeting GPUs. The evaluation is conducted using the Lempel‐Ziv‐Storer‐Szymanski compression application considering different input workloads. As a general result of our work, we noticed that algorithms with elastic adaptation factors respond better for stable workloads, while algorithms with narrower targets respond better for highly unbalanced workloads. | |
2019 |
|
Vogel, Adriano; Griebler, Dalvan; Danelutto, Marco; Fernandes, Luiz Gustavo Seamless Parallelism Management for Multi-core Stream Processing Inproceedings Advances in Parallel Computing, Proceedings of the International Conference on Parallel Computing (ParCo), pp. 533 - 542, IOS Press, Prague, Czech Republic, 2019. @inproceedings{VOGEL:PARCO:19, title = {Seamless Parallelism Management for Multi-core Stream Processing}, author = {Adriano Vogel and Dalvan Griebler and Marco Danelutto and Luiz Gustavo Fernandes}, url = {https://doi.org/10.3233/APC200082}, doi = {10.3233/APC200082}, year = {2019}, date = {2019-09-01}, booktitle = {Advances in Parallel Computing, Proceedings of the International Conference on Parallel Computing (ParCo)}, volume = {36}, pages = {533 - 542}, publisher = {IOS Press}, address = {Prague, Czech Republic}, series = {ParCo'19}, abstract = {Video streaming applications have critical performance requirements for dealing with fluctuating workloads and providing results in real-time. As a consequence, the majority of these applications demand parallelism for delivering quality of service to users. Although high-level and structured parallel programming aims at facilitating parallelism exploitation, there are still several issues to be addressed for increasing/improving existing parallel programming abstractions. In this paper, we aim at employing self-adaptivity for stream processing in order to seamlessly manage the application parallelism configurations at run-time, where a new strategy alleviates from application programmers the need to set time-consuming and error-prone parallelism parameters. The new strategy was implemented and validated on SPar. 
The results have shown that the proposed solution increases the level of abstraction and achieved a competitive performance.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Video streaming applications have critical performance requirements for dealing with fluctuating workloads and providing results in real-time. As a consequence, the majority of these applications demand parallelism for delivering quality of service to users. Although high-level and structured parallel programming aims at facilitating parallelism exploitation, there are still several issues to be addressed for increasing/improving existing parallel programming abstractions. In this paper, we aim at employing self-adaptivity for stream processing in order to seamlessly manage the application parallelism configurations at run-time, where a new strategy alleviates from application programmers the need to set time-consuming and error-prone parallelism parameters. The new strategy was implemented and validated on SPar. The results have shown that the proposed solution increases the level of abstraction and achieved a competitive performance. | |
Rockenbach, Dinei André; Griebler, Dalvan; Danelutto, Marco; Fernandes, Luiz Gustavo High-Level Stream Parallelism Abstractions with SPar Targeting GPUs Inproceedings Advances in Parallel Computing, Proceedings of the International Conference on Parallel Computing (ParCo), pp. 543 - 552, IOS Press, Prague, Czech Republic, 2019. @inproceedings{ROCKENBACH:PARCO:19, title = {High-Level Stream Parallelism Abstractions with SPar Targeting GPUs}, author = {Dinei André Rockenbach and Dalvan Griebler and Marco Danelutto and Luiz Gustavo Fernandes}, url = {https://doi.org/10.3233/APC200083}, doi = {10.3233/APC200083}, year = {2019}, date = {2019-09-01}, booktitle = {Advances in Parallel Computing, Proceedings of the International Conference on Parallel Computing (ParCo)}, volume = {36}, pages = {543 - 552}, publisher = {IOS Press}, address = {Prague, Czech Republic}, series = {ParCo'19}, abstract = {The combined exploitation of stream and data parallelism is demonstrating encouraging performance results in the literature for heterogeneous architectures, which are present on every computer systems today. However, provide parallel software efficiently targeting those architectures requires significant programming effort and expertise. The SPar domain-specific language already represents a solution to this problem providing proven high-level programming abstractions for multi-core architectures. In this paper, we enrich the SPar language adding support for GPUs. New transformation rules are designed for generating parallel code using stream and data parallel patterns. 
Our experiments revealed that these transformation rules are able to improve performance while the high-level programming abstractions are maintained.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The combined exploitation of stream and data parallelism is demonstrating encouraging performance results in the literature for heterogeneous architectures, which are present on every computer systems today. However, provide parallel software efficiently targeting those architectures requires significant programming effort and expertise. The SPar domain-specific language already represents a solution to this problem providing proven high-level programming abstractions for multi-core architectures. In this paper, we enrich the SPar language adding support for GPUs. New transformation rules are designed for generating parallel code using stream and data parallel patterns. Our experiments revealed that these transformation rules are able to improve performance while the high-level programming abstractions are maintained. | |
Vogel, Adriano; Griebler, Dalvan; Danelutto, Marco; Fernandes, Luiz Gustavo Minimizing Self-Adaptation Overhead in Parallel Stream Processing for Multi-Cores Book Chapter Euro-Par 2019: Parallel Processing Workshops, 11997 , pp. 30 - 41, Springer, Göttingen, Germany, 2019. @inbook{VOGEL:adaptive-overhead:AutoDaSP:19, title = {Minimizing Self-Adaptation Overhead in Parallel Stream Processing for Multi-Cores}, author = {Adriano Vogel and Dalvan Griebler and Marco Danelutto and Luiz Gustavo Fernandes}, url = {https://doi.org/10.1007/978-3-030-48340-1_3}, doi = {10.1007/978-3-030-48340-1_3}, year = {2019}, date = {2019-08-01}, booktitle = {Euro-Par 2019: Parallel Processing Workshops}, volume = {11997}, pages = {30 - 41}, publisher = {Springer}, address = {Göttingen, Germany}, abstract = {Stream processing paradigm is present in several applications that apply computations over continuous data flowing in the form of streams (e.g., video feeds, image, and data analytics). Employing self-adaptivity to stream processing applications can provide higher-level programming abstractions and autonomic resource management. However, there are cases where the performance is suboptimal. In this paper, the goal is to optimize parallelism adaptations in terms of stability and accuracy, which can improve the performance of parallel stream processing applications. Therefore, we present a new optimized self-adaptive strategy that is experimentally evaluated. The proposed solution provided high-level programming abstractions, reduced the adaptation overhead, and achieved a competitive performance with the best static executions.}, keywords = {}, pubstate = {published}, tppubtype = {inbook} } Stream processing paradigm is present in several applications that apply computations over continuous data flowing in the form of streams (e.g., video feeds, image, and data analytics). 
Employing self-adaptivity to stream processing applications can provide higher-level programming abstractions and autonomic resource management. However, there are cases where the performance is suboptimal. In this paper, the goal is to optimize parallelism adaptations in terms of stability and accuracy, which can improve the performance of parallel stream processing applications. Therefore, we present a new optimized self-adaptive strategy that is experimentally evaluated. The proposed solution provided high-level programming abstractions, reduced the adaptation overhead, and achieved a competitive performance with the best static executions. | |
Rockenbach, Dinei André; Stein, Charles Michael; Griebler, Dalvan; Mencagli, Gabriele; Torquati, Massimo; Danelutto, Marco; Fernandes, Luiz Gustavo Stream Processing on Multi-cores with GPUs: Parallel Programming Models' Challenges Inproceedings IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), pp. 834-841, IEEE, Rio de Janeiro, Brazil, 2019. @inproceedings{ROCKENBACH:stream-multigpus:IPDPSW:19, title = {Stream Processing on Multi-cores with GPUs: Parallel Programming Models' Challenges}, author = {Dinei André Rockenbach and Charles Michael Stein and Dalvan Griebler and Gabriele Mencagli and Massimo Torquati and Marco Danelutto and Luiz Gustavo Fernandes}, url = {https://doi.org/10.1109/IPDPSW.2019.00137}, doi = {10.1109/IPDPSW.2019.00137}, year = {2019}, date = {2019-07-01}, booktitle = {IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, pages = {834-841}, publisher = {IEEE}, address = {Rio de Janeiro, Brazil}, series = {IPDPSW'19}, abstract = {The stream processing paradigm is used in several scientific and enterprise applications in order to continuously compute results out of data items coming from data sources such as sensors. The full exploitation of the potential parallelism offered by current heterogeneous multi-cores equipped with one or more GPUs is still a challenge in the context of stream processing applications. In this work, our main goal is to present the parallel programming challenges that the programmer has to face when exploiting CPUs and GPUs' parallelism at the same time using traditional programming models. We highlight the parallelization methodology in two use-cases (the Mandelbrot Streaming benchmark and the PARSEC's Dedup application) to demonstrate the issues and benefits of using heterogeneous parallel hardware. 
The experiments conducted demonstrate how a high-level parallel programming model targeting stream processing like the one offered by SPar can be used to reduce the programming effort still offering a good level of performance if compared with state-of-the-art programming models.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The stream processing paradigm is used in several scientific and enterprise applications in order to continuously compute results out of data items coming from data sources such as sensors. The full exploitation of the potential parallelism offered by current heterogeneous multi-cores equipped with one or more GPUs is still a challenge in the context of stream processing applications. In this work, our main goal is to present the parallel programming challenges that the programmer has to face when exploiting CPUs and GPUs' parallelism at the same time using traditional programming models. We highlight the parallelization methodology in two use-cases (the Mandelbrot Streaming benchmark and the PARSEC's Dedup application) to demonstrate the issues and benefits of using heterogeneous parallel hardware. The experiments conducted demonstrate how a high-level parallel programming model targeting stream processing like the one offered by SPar can be used to reduce the programming effort still offering a good level of performance if compared with state-of-the-art programming models. | |
Griebler, Dalvan; Vogel, Adriano; Sensi, Daniele De; Danelutto, Marco; Fernandes, Luiz Gustavo Simplifying and implementing service level objectives for stream parallelism Journal Article Journal of Supercomputing, 76 , pp. 4603 - 4628, 2019, ISSN: 0920-8542. @article{GRIEBLER:JS:19, title = {Simplifying and implementing service level objectives for stream parallelism}, author = {Dalvan Griebler and Adriano Vogel and Daniele De Sensi and Marco Danelutto and Luiz Gustavo Fernandes}, url = {https://doi.org/10.1007/s11227-019-02914-6}, doi = {10.1007/s11227-019-02914-6}, issn = {0920-8542}, year = {2019}, date = {2019-06-01}, journal = {Journal of Supercomputing}, volume = {76}, pages = {4603 - 4628}, publisher = {Springer}, abstract = {An increasing attention has been given to provide service level objectives (SLOs) in stream processing applications due to the performance and energy requirements, and because of the need to impose limits in terms of resource usage while improving the system utilization. Since the current and next-generation computing systems are intrinsically offering parallel architectures, the software has to naturally exploit the architecture parallelism. Implement and meet SLOs on existing applications is not a trivial task for application programmers, since the software development process, besides the parallelism exploitation, requires the implementation of autonomic algorithms or strategies. This is a system-oriented programming approach and requires the management of multiple knobs and sensors (e.g., the number of threads to use, the clock frequency of the cores, etc.) so that the system can self-adapt at runtime. In this work, we introduce a new and simpler way to define SLO in the application’s source code, by abstracting from the programmer all the details relative to self-adaptive system implementation. The application programmer specifies which parts of the code to parallelize and the related SLOs that should be enforced. 
To reach this goal, source-to-source code transformation rules are implemented in our compiler, which automatically generates self-adaptive strategies to enforce, at runtime, the user-expressed objectives. The experiments highlighted promising results with simpler, effective, and efficient SLO implementations for real-world applications.}, keywords = {}, pubstate = {published}, tppubtype = {article} } An increasing attention has been given to provide service level objectives (SLOs) in stream processing applications due to the performance and energy requirements, and because of the need to impose limits in terms of resource usage while improving the system utilization. Since the current and next-generation computing systems are intrinsically offering parallel architectures, the software has to naturally exploit the architecture parallelism. Implement and meet SLOs on existing applications is not a trivial task for application programmers, since the software development process, besides the parallelism exploitation, requires the implementation of autonomic algorithms or strategies. This is a system-oriented programming approach and requires the management of multiple knobs and sensors (e.g., the number of threads to use, the clock frequency of the cores, etc.) so that the system can self-adapt at runtime. In this work, we introduce a new and simpler way to define SLO in the application’s source code, by abstracting from the programmer all the details relative to self-adaptive system implementation. The application programmer specifies which parts of the code to parallelize and the related SLOs that should be enforced. To reach this goal, source-to-source code transformation rules are implemented in our compiler, which automatically generates self-adaptive strategies to enforce, at runtime, the user-expressed objectives. The experiments highlighted promising results with simpler, effective, and efficient SLO implementations for real-world applications. | |
Stein, Charles Michael; Griebler, Dalvan; Danelutto, Marco; Fernandes, Luiz Gustavo Stream Parallelism on the LZSS Data Compression Application for Multi-Cores with GPUs Inproceedings 27th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP), pp. 247-251, IEEE, Pavia, Italy, 2019. @inproceedings{STEIN:LZSS-multigpu:PDP:19, title = {Stream Parallelism on the LZSS Data Compression Application for Multi-Cores with GPUs}, author = {Charles Michael Stein and Dalvan Griebler and Marco Danelutto and Luiz Gustavo Fernandes}, url = {https://doi.org/10.1109/EMPDP.2019.8671624}, doi = {10.1109/EMPDP.2019.8671624}, year = {2019}, date = {2019-02-01}, booktitle = {27th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)}, pages = {247-251}, publisher = {IEEE}, address = {Pavia, Italy}, series = {PDP'19}, abstract = {GPUs have been used to accelerate different data parallel applications. The challenge consists in using GPUs to accelerate stream processing applications. Our goal is to investigate and evaluate whether stream parallel applications may benefit from parallel execution on both CPU and GPU cores. In this paper, we introduce new parallel algorithms for the Lempel-Ziv-Storer-Szymanski (LZSS) data compression application. We implemented the algorithms targeting both CPUs and GPUs. GPUs have been used with CUDA and OpenCL to exploit inner algorithm data parallelism. Outer stream parallelism has been exploited using CPU cores through SPar. The parallel implementation of LZSS achieved 135 fold speedup using a multi-core CPU and two GPUs. We also observed speedups in applications where we were not expecting to get it using the same combined data-stream parallel exploitation techniques.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } GPUs have been used to accelerate different data parallel applications. 
The challenge consists in using GPUs to accelerate stream processing applications. Our goal is to investigate and evaluate whether stream parallel applications may benefit from parallel execution on both CPU and GPU cores. In this paper, we introduce new parallel algorithms for the Lempel-Ziv-Storer-Szymanski (LZSS) data compression application. We implemented the algorithms targeting both CPUs and GPUs. GPUs have been used with CUDA and OpenCL to exploit inner algorithm data parallelism. Outer stream parallelism has been exploited using CPU cores through SPar. The parallel implementation of LZSS achieved 135 fold speedup using a multi-core CPU and two GPUs. We also observed speedups in applications where we were not expecting to get it using the same combined data-stream parallel exploitation techniques. | |
2018 |
|
Vogel, Adriano; Griebler, Dalvan; Sensi, Daniele De; Danelutto, Marco; Fernandes, Luiz Gustavo Autonomic and Latency-Aware Degree of Parallelism Management in SPar Book Chapter Euro-Par 2018: Parallel Processing Workshops, pp. 28–39, Springer, Turin, Italy, 2018. @inbook{VOGEL:Adaptive-Latency-SPar:AutoDaSP:18, title = {Autonomic and Latency-Aware Degree of Parallelism Management in SPar}, author = {Adriano Vogel and Dalvan Griebler and Daniele De Sensi and Marco Danelutto and Luiz Gustavo Fernandes}, url = {http://dx.doi.org/10.1007/978-3-030-10549-5_3}, doi = {10.1007/978-3-030-10549-5_3}, year = {2018}, date = {2018-08-01}, booktitle = {Euro-Par 2018: Parallel Processing Workshops}, pages = {28--39}, publisher = {Springer}, address = {Turin, Italy}, abstract = {Stream processing applications became a representative workload in current computing systems. A significant part of these applications demands parallelism to increase performance. However, programmers are often facing a trade-off between coding productivity and performance when introducing parallelism. SPar was created for balancing this trade-off to the application programmers by using the C++11 attributes' annotation mechanism. In SPar and other programming frameworks for stream processing applications, the manual definition of the number of replicas to be used for the stream operators is a challenge. In addition to that, low latency is required by several stream processing applications. We noted that explicit latency requirements are poorly considered on the state-of-the-art parallel programming frameworks. Since there is a direct relationship between the number of replicas and the latency of the application, in this work we propose an autonomic and adaptive strategy to choose the proper number of replicas in SPar to address latency constraints. 
We experimentally evaluated our implemented strategy and demonstrated its effectiveness on a real-world application, demonstrating that our adaptive strategy can provide higher abstraction levels while automatically managing the latency.}, keywords = {}, pubstate = {published}, tppubtype = {inbook} } Stream processing applications became a representative workload in current computing systems. A significant part of these applications demands parallelism to increase performance. However, programmers are often facing a trade-off between coding productivity and performance when introducing parallelism. SPar was created for balancing this trade-off to the application programmers by using the C++11 attributes' annotation mechanism. In SPar and other programming frameworks for stream processing applications, the manual definition of the number of replicas to be used for the stream operators is a challenge. In addition to that, low latency is required by several stream processing applications. We noted that explicit latency requirements are poorly considered on the state-of-the-art parallel programming frameworks. Since there is a direct relationship between the number of replicas and the latency of the application, in this work we propose an autonomic and adaptive strategy to choose the proper number of replicas in SPar to address latency constraints. We experimentally evaluated our implemented strategy and demonstrated its effectiveness on a real-world application, demonstrating that our adaptive strategy can provide higher abstraction levels while automatically managing the latency. | |
Griebler, Dalvan; Sensi, Daniele De; Vogel, Adriano; Danelutto, Marco; Fernandes, Luiz Gustavo Service Level Objectives via C++11 Attributes Book Chapter Euro-Par 2018: Parallel Processing Workshops, pp. 745–756, Springer, Turin, Italy, 2018. @inbook{GRIEBLER:SLO-SPar-Nornir:REPARA:18, title = {Service Level Objectives via C++11 Attributes}, author = {Dalvan Griebler and Daniele De Sensi and Adriano Vogel and Marco Danelutto and Luiz Gustavo Fernandes}, url = {http://dx.doi.org/10.1007/978-3-030-10549-5_58}, doi = {10.1007/978-3-030-10549-5_58}, year = {2018}, date = {2018-08-01}, booktitle = {Euro-Par 2018: Parallel Processing Workshops}, pages = {745--756}, publisher = {Springer}, address = {Turin, Italy}, series = { Lecture Notes in Computer Science }, abstract = {In recent years, increasing attention has been given to the possibility of guaranteeing Service Level Objectives (SLOs) to users about their applications, either regarding performance or power consumption. SLO can be implemented for parallel applications since they can provide many control knobs (e.g., the number of threads to use, the clock frequency of the cores, etc.) to tune the performance and power consumption of the application. Different from most of the existing approaches, we target sequential stream processing applications by proposing a solution based on C++ annotations. The user specifies which parts of the code to parallelize and what type of requirements should be enforced on that part of the code. Our solution first automatically parallelizes the annotated code and then applies self-adaptation approaches at run-time to enforce the user-expressed objectives. 
We ran experiments on different real-world applications, showing its simplicity and effectiveness.}, keywords = {}, pubstate = {published}, tppubtype = {inbook} } In recent years, increasing attention has been given to the possibility of guaranteeing Service Level Objectives (SLOs) to users about their applications, either regarding performance or power consumption. SLO can be implemented for parallel applications since they can provide many control knobs (e.g., the number of threads to use, the clock frequency of the cores, etc.) to tune the performance and power consumption of the application. Different from most of the existing approaches, we target sequential stream processing applications by proposing a solution based on C++ annotations. The user specifies which parts of the code to parallelize and what type of requirements should be enforced on that part of the code. Our solution first automatically parallelizes the annotated code and then applies self-adaptation approaches at run-time to enforce the user-expressed objectives. We ran experiments on different real-world applications, showing its simplicity and effectiveness. | |
Griebler, Dalvan; Hoffmann, Renato B; Danelutto, Marco; Fernandes, Luiz Gustavo Stream Parallelism with Ordered Data Constraints on Multi-Core Systems Journal Article Journal of Supercomputing, 75 , pp. 1-20, 2018, ISSN: 0920-8542. @article{GRIEBLER:JS:18, title = {Stream Parallelism with Ordered Data Constraints on Multi-Core Systems}, author = {Dalvan Griebler and Renato B Hoffmann and Marco Danelutto and Luiz Gustavo Fernandes}, url = {https://doi.org/10.1007/s11227-018-2482-7}, doi = {10.1007/s11227-018-2482-7}, issn = {0920-8542}, year = {2018}, date = {2018-07-01}, journal = {Journal of Supercomputing}, volume = {75}, pages = {1-20}, publisher = {Springer}, abstract = {It is often a challenge to keep input/output tasks/results in order for parallel computations over data streams, particularly when stateless task operators are replicated to increase parallelism when there are irregular tasks. Maintaining input/output order requires additional coding effort and may significantly impact the application's actual throughput. Thus, we propose a new implementation technique designed to be easily integrated with any of the existing C++ parallel programming frameworks that support stream parallelism. In this paper, it is first implemented and studied using SPar, our high-level domain-specific language for stream parallelism. We discuss the results of a set of experiments with real-world applications revealing how significant performance improvements may be achieved when our proposed solution is integrated within SPar, especially for data compression applications. 
Also, we show the results of experiments performed after integrating our solution within FastFlow and TBB, revealing no significant overheads.}, keywords = {}, pubstate = {published}, tppubtype = {article} } It is often a challenge to keep input/output tasks/results in order for parallel computations over data streams, particularly when stateless task operators are replicated to increase parallelism when there are irregular tasks. Maintaining input/output order requires additional coding effort and may significantly impact the application's actual throughput. Thus, we propose a new implementation technique designed to be easily integrated with any of the existing C++ parallel programming frameworks that support stream parallelism. In this paper, it is first implemented and studied using SPar, our high-level domain-specific language for stream parallelism. We discuss the results of a set of experiments with real-world applications revealing how significant performance improvements may be achieved when our proposed solution is integrated within SPar, especially for data compression applications. Also, we show the results of experiments performed after integrating our solution within FastFlow and TBB, revealing no significant overheads. | |
Vogel, Adriano Adaptive Degree of Parallelism for the SPar Runtime Masters Thesis School of Technology - PPGCC - PUCRS, 2018. @mastersthesis{VOGEL:DM:18, title = {Adaptive Degree of Parallelism for the SPar Runtime}, author = {Adriano Vogel}, url = {http://tede2.pucrs.br/tede2/handle/tede/8255}, year = {2018}, date = {2018-03-01}, address = {Porto Alegre, Brazil}, school = {School of Technology - PPGCC - PUCRS}, abstract = {In recent years, stream processing applications have become a traditional workload in computing systems. They are traditionally found in video, audio, graphic and image processing. Many of these applications demand parallelism to increase performance. However, programmers must often face the trade-off between coding productivity and performance that introducing parallelism creates. SPar Domain-Specific Language (DSL) was created to achieve the optimal balance for programmers, with the C++-11 attribute annotation mechanism to ensure that essential properties of stream parallelism could be represented (stage, input, output, and replicate). The compiler recognizes the SPar attributes and generates parallel code automatically. The need to manually define parallelism is the crucial challenge for increasing SPar's abstraction level, because it is time consuming and error prone. Also, executing several applications can fail to be efficient when running a non-suitable number of replicas. This occurs when the defined number of replicas in a parallel region is not optimal or when a static number is used, which ignores the dynamic nature of stream processing applications. In order to solve this problem, we introduced the concept of the abstracted and adaptive number of replicas for SPar. Moreover, we described our implemented mechanism as well as transformation rules that enable SPar to generate parallel code with the adaptive degree of parallelism support. We experimentally evaluated the implemented adaptive mechanisms regarding their effectiveness. 
Thus, we used real-world applications to demonstrate that our adaptive mechanism implementations can provide higher abstraction levels without significant performance degradation.}, keywords = {}, pubstate = {published}, tppubtype = {mastersthesis} } In recent years, stream processing applications have become a traditional workload in computing systems. They are traditionally found in video, audio, graphic and image processing. Many of these applications demand parallelism to increase performance. However, programmers must often face the trade-off between coding productivity and performance that introducing parallelism creates. SPar Domain-Specific Language (DSL) was created to achieve the optimal balance for programmers, with the C++-11 attribute annotation mechanism to ensure that essential properties of stream parallelism could be represented (stage, input, output, and replicate). The compiler recognizes the SPar attributes and generates parallel code automatically. The need to manually define parallelism is the crucial challenge for increasing SPar's abstraction level, because it is time consuming and error prone. Also, executing several applications can fail to be efficient when running a non-suitable number of replicas. This occurs when the defined number of replicas in a parallel region is not optimal or when a static number is used, which ignores the dynamic nature of stream processing applications. In order to solve this problem, we introduced the concept of the abstracted and adaptive number of replicas for SPar. Moreover, we described our implemented mechanism as well as transformation rules that enable SPar to generate parallel code with the adaptive degree of parallelism support. We experimentally evaluated the implemented adaptive mechanisms regarding their effectiveness. Thus, we used real-world applications to demonstrate that our adaptive mechanism implementations can provide higher abstraction levels without significant performance degradation. | |
Griebler, Dalvan; Filho, Renato B H; Danelutto, Marco; Fernandes, Luiz Gustavo High-Level and Productive Stream Parallelism for Dedup, Ferret, and Bzip2 Journal Article International Journal of Parallel Programming, 47 (2), pp. 253-271, 2018, ISSN: 1573-7640. @article{GRIEBLER:IJPP:18, title = {High-Level and Productive Stream Parallelism for Dedup, Ferret, and Bzip2}, author = {Dalvan Griebler and Renato B H Filho and Marco Danelutto and Luiz Gustavo Fernandes}, editor = {J. Daniel García and Arturo Gonzalez-Escribano}, url = {https://doi.org/10.1007/s10766-018-0558-x}, doi = {10.1007/s10766-018-0558-x}, issn = {1573-7640}, year = {2018}, date = {2018-02-01}, journal = {International Journal of Parallel Programming}, volume = {47}, number = {2}, pages = {253-271}, publisher = {Springer}, abstract = {Parallel programming has been a challenging task for application programmers. Stream processing is an application domain present in several scientific, enterprise, and financial areas that lack suitable abstractions to exploit parallelism. Our goal is to assess the feasibility of state-of-the-art frameworks/libraries (Pthreads, TBB, and FastFlow) and the SPar domain-specific language for real-world streaming applications (Dedup, Ferret, and Bzip2) targeting multi-core architectures. SPar was specially designed to provide high-level and productive stream parallelism abstractions, supporting programmers with standard C++-11 annotations. For the experiments, we implemented three streaming applications. We discussed SPar’s programmability advantages compared to the frameworks in terms of productivity and structured parallel programming. The results demonstrate that SPar improves productivity and provides the necessary features to achieve similar performances compared to the state-of-the-art.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Parallel programming has been a challenging task for application programmers. 
Stream processing is an application domain present in several scientific, enterprise, and financial areas that lack suitable abstractions to exploit parallelism. Our goal is to assess the feasibility of state-of-the-art frameworks/libraries (Pthreads, TBB, and FastFlow) and the SPar domain-specific language for real-world streaming applications (Dedup, Ferret, and Bzip2) targeting multi-core architectures. SPar was specially designed to provide high-level and productive stream parallelism abstractions, supporting programmers with standard C++-11 annotations. For the experiments, we implemented three streaming applications. We discussed SPar’s programmability advantages compared to the frameworks in terms of productivity and structured parallel programming. The results demonstrate that SPar improves productivity and provides the necessary features to achieve similar performances compared to the state-of-the-art. | |
2017 |
|
Griebler, Dalvan; Hoffmann, Renato B; Loff, Junior; Danelutto, Marco; Fernandes, Luiz G High-Level and Efficient Stream Parallelism on Multi-core Systems with SPar for Data Compression Applications Inproceedings XVIII Simpósio em Sistemas Computacionais de Alto Desempenho, pp. 16-27, SBC, Campinas, SP, Brasil, 2017, ISBN: 2358-6613. @inproceedings{GRIEBLER:WSCAD:17, title = {High-Level and Efficient Stream Parallelism on Multi-core Systems with SPar for Data Compression Applications}, author = {Dalvan Griebler and Renato B Hoffmann and Junior Loff and Marco Danelutto and Luiz G Fernandes}, isbn = {2358-6613}, year = {2017}, date = {2017-10-01}, booktitle = {XVIII Simpósio em Sistemas Computacionais de Alto Desempenho}, pages = {16-27}, publisher = {SBC}, address = {Campinas, SP, Brasil}, abstract = {The stream processing domain is present in several real-world applications that are running on multi-core systems. In this paper, we focus on data compression applications that are an important sub-set of this domain. Our main goal is to assess the programmability and efficiency of domain-specific language called SPar. It was specially designed for expressing stream parallelism and it promises higher-level parallelism abstractions without significant performance losses. Therefore, we parallelized Lzip and Bzip2 compressors with SPar and compared with state-of-the-art frameworks. The results revealed that SPar is able to efficiently exploit stream parallelism as well as provide suitable abstractions with less code intrusion and code re-factoring.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The stream processing domain is present in several real-world applications that are running on multi-core systems. In this paper, we focus on data compression applications that are an important sub-set of this domain. Our main goal is to assess the programmability and efficiency of domain-specific language called SPar. 
It was specially designed for expressing stream parallelism and it promises higher-level parallelism abstractions without significant performance losses. Therefore, we parallelized Lzip and Bzip2 compressors with SPar and compared with state-of-the-art frameworks. The results revealed that SPar is able to efficiently exploit stream parallelism as well as provide suitable abstractions with less code intrusion and code re-factoring. | |
Griebler, Dalvan; Fernandes, Luiz Gustavo Towards Distributed Parallel Programming Support for the SPar DSL Inproceedings Parallel Computing: On the Road to Exascale, Proceedings of the International Conference on Parallel Computing, pp. 563-572, IOS Press, Bologna, Italy, 2017, ISBN: 978-1-61499-843-3. @inproceedings{GRIEBLER:PARCO:17, title = {Towards Distributed Parallel Programming Support for the SPar DSL}, author = {Dalvan Griebler and Luiz Gustavo Fernandes}, url = {https://doi.org/10.3233/978-1-61499-843-3-563}, doi = {10.3233/978-1-61499-843-3-563}, isbn = {978-1-61499-843-3}, year = {2017}, date = {2017-09-01}, booktitle = {Parallel Computing: On the Road to Exascale, Proceedings of the International Conference on Parallel Computing}, pages = {563-572}, publisher = {IOS Press}, address = {Bologna, Italy}, series = {ParCo'17}, abstract = {SPar was originally designed to provide high-level abstractions for stream parallelism in C++ programs targeting multi-core systems. This work proposes distributed parallel programming support for SPar targeting cluster environments. The goal is to preserve the original semantics while source-to-source code transformations will be turned into MPI (Message Passing Interface) parallel code. The results of the experiments presented in the paper demonstrate improved programmability without significant performance losses.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } SPar was originally designed to provide high-level abstractions for stream parallelism in C++ programs targeting multi-core systems. This work proposes distributed parallel programming support for SPar targeting cluster environments. The goal is to preserve the original semantics while source-to-source code transformations will be turned into MPI (Message Passing Interface) parallel code. The results of the experiments presented in the paper demonstrate improved programmability without significant performance losses. | |
Griebler, Dalvan; Filho, Renato B H; Danelutto, Marco; Fernandes, Luiz Gustavo Higher-Level Parallelism Abstractions for Video Applications with SPar Inproceedings Parallel Computing: On the Road to Exascale, Proceedings of the International Conference on Parallel Computing, pp. 698-707, IOS Press, Bologna, Italy, 2017, ISBN: 978-1-61499-843-3. @inproceedings{GRIEBLER:REPARA:17, title = {Higher-Level Parallelism Abstractions for Video Applications with SPar}, author = {Dalvan Griebler and Renato B H Filho and Marco Danelutto and Luiz Gustavo Fernandes}, url = {https://doi.org/10.3233/978-1-61499-843-3-698}, doi = {10.3233/978-1-61499-843-3-698}, isbn = {978-1-61499-843-3}, year = {2017}, date = {2017-09-01}, booktitle = {Parallel Computing: On the Road to Exascale, Proceedings of the International Conference on Parallel Computing}, pages = {698-707}, publisher = {IOS Press}, address = {Bologna, Italy}, series = {RePara'17}, abstract = {SPar is a Domain-Specific Language (DSL) designed to provide high-level parallel programming abstractions for streaming applications. Video processing belongs to this application domain that requires parallel processing to extract and analyze information quickly. When using state-of-the-art frameworks such as FastFlow and TBB, the application programmer has to manage source code re-factoring and performance optimization for implementing the parallelism efficiently. This is not attractive for the application programmer audience. Our goal is to assess SPar's programming language and performance in traditional video applications. We also discuss different implementations and compare them to SPar. 
The results have shown that SPar maintains the sequential code structure, requires less code intrusion and provides higher-level programming abstractions without performance losses.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } SPar is a Domain-Specific Language (DSL) designed to provide high-level parallel programming abstractions for streaming applications. Video processing belongs to this application domain that requires parallel processing to extract and analyze information quickly. When using state-of-the-art frameworks such as FastFlow and TBB, the application programmer has to manage source code re-factoring and performance optimization for implementing the parallelism efficiently. This is not attractive for the application programmer audience. Our goal is to assess SPar's programming language and performance in traditional video applications. We also discuss different implementations and compare them to SPar. The results have shown that SPar maintains the sequential code structure, requires less code intrusion and provides higher-level programming abstractions without performance losses. | |
Ledur, Cleverson; Griebler, Dalvan; Manssour, Isabel; Fernandes, Luiz Gustavo A High-Level DSL for Geospatial Visualizations with Multi-core Parallelism Support Inproceedings 41st Annual Computer Software and Applications Conference (COMPSAC), pp. 298-304, IEEE, Torino, Italy, 2017, ISBN: 978-1-5386-0367-3. @inproceedings{LEDUR:COMPSAC:17, title = {A High-Level DSL for Geospatial Visualizations with Multi-core Parallelism Support}, author = {Cleverson Ledur and Dalvan Griebler and Isabel Manssour and Luiz Gustavo Fernandes}, url = {https://doi.org/10.1109/COMPSAC.2017.18}, doi = {10.1109/COMPSAC.2017.18}, isbn = {978-1-5386-0367-3}, year = {2017}, date = {2017-07-01}, booktitle = { 41st Annual Computer Software and Applications Conference (COMPSAC)}, pages = {298-304}, publisher = {IEEE}, address = {Torino, Italy}, series = {COMPSAC'17}, abstract = {The amount of data generated worldwide associated with geolocalization has exponentially increased over the last decade due to social networks, population demographics, and the popularization of Global Positioning Systems. Several methods for geovisualization have already been developed, but many of them are focused on a specific application or require learning a variety of tools and programming languages. It becomes even more difficult when users have to manage a large amount of data because state-of-the-art alternatives require the use of third-party pre-processing tools. We present a novel Domain-Specific Language (DSL), which focuses on large data geovisualizations. Through a compiler, we support automatic visualization generations and data pre-processing. The system takes advantage of multi-core parallelism to speed-up data pre-processing abstractly. Our experiments were designated to highlight the programming effort and performance of our DSL. 
The results have shown a considerable programming effort reduction and efficient parallelism support with respect to the sequential version.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The amount of data generated worldwide associated with geolocalization has exponentially increased over the last decade due to social networks, population demographics, and the popularization of Global Positioning Systems. Several methods for geovisualization have already been developed, but many of them are focused on a specific application or require learning a variety of tools and programming languages. It becomes even more difficult when users have to manage a large amount of data because state-of-the-art alternatives require the use of third-party pre-processing tools. We present a novel Domain-Specific Language (DSL), which focuses on large data geovisualizations. Through a compiler, we support automatic visualization generations and data pre-processing. The system takes advantage of multi-core parallelism to speed-up data pre-processing abstractly. Our experiments were designated to highlight the programming effort and performance of our DSL. The results have shown a considerable programming effort reduction and efficient parallelism support with respect to the sequential version. | |
Griebler, Dalvan; Danelutto, Marco; Torquati, Massimo; Fernandes, Luiz Gustavo SPar: A DSL for High-Level and Productive Stream Parallelism Journal Article Parallel Processing Letters, 27 (01), pp. 1740005, 2017, ISSN: 1793-642X. @article{GRIEBLER:PPL:17, title = {SPar: A DSL for High-Level and Productive Stream Parallelism}, author = {Dalvan Griebler and Marco Danelutto and Massimo Torquati and Luiz Gustavo Fernandes}, url = {http://dx.doi.org/10.1142/S0129626417400059}, doi = {10.1142/S0129626417400059}, issn = {1793-642X}, year = {2017}, date = {2017-03-01}, journal = {Parallel Processing Letters}, volume = {27}, number = {01}, pages = {1740005}, publisher = {World Scientific}, abstract = {This paper introduces SPar, an internal C++ Domain-Specific Language (DSL) that supports the development of classic stream parallel applications. The DSL uses standard C++ attributes to introduce annotations tagging the notable components of stream parallel applications: stream sources and stream processing stages. A set of tools process SPar code (C++ annotated code using the SPar attributes) to generate FastFlow C++ code that exploits the stream parallelism denoted by SPar annotations while targeting shared memory multi-core architectures. We outline the main SPar features along with the main implementation techniques and tools. Also, we show the results of experiments assessing the feasibility of the entire approach as well as SPar’s performance and expressiveness.}, keywords = {}, pubstate = {published}, tppubtype = {article} } This paper introduces SPar, an internal C++ Domain-Specific Language (DSL) that supports the development of classic stream parallel applications. The DSL uses standard C++ attributes to introduce annotations tagging the notable components of stream parallel applications: stream sources and stream processing stages. 
A set of tools process SPar code (C++ annotated code using the SPar attributes) to generate FastFlow C++ code that exploits the stream parallelism denoted by SPar annotations while targeting shared memory multi-core architectures. We outline the main SPar features along with the main implementation techniques and tools. Also, we show the results of experiments assessing the feasibility of the entire approach as well as SPar’s performance and expressiveness. | |
2016 |
|
Griebler, Dalvan Domain-Specific Language & Support Tool for High-Level Stream Parallelism PhD Thesis Faculdade de Informática - PPGCC - PUCRS, 2016. @phdthesis{GRIEBLER:PHD:16, title = {Domain-Specific Language \& Support Tool for High-Level Stream Parallelism}, author = {Dalvan Griebler}, url = {http://hdl.handle.net/10923/8500}, year = {2016}, date = {2016-06-01}, address = {Porto Alegre, Brazil}, school = {Faculdade de Informática - PPGCC - PUCRS}, abstract = {Stream-based systems are representative of several application domains including video, audio, networking, graphic processing, etc. Stream programs may run on different kinds of parallel architectures (desktop, servers, cell phones, and supercomputers) and represent significant workloads on our current computing systems. Nevertheless, most of them are still not parallelized. Moreover, when new software has to be developed, programmers often face a trade-off between coding productivity, code portability, and performance. To solve this problem, we provide a new Domain-Specific Language (DSL) that naturally/on-the-fly captures and represents parallelism for stream-based applications. The aim is to offer a set of attributes (through annotations) that preserves the program's source code and is not architecture-dependent for annotating parallelism. We used the C++ attribute mechanism to design a ``\textit{de-facto}'' standard C++ embedded DSL named SPar. However, the implementation of DSLs using compiler-based tools is difficult, complicated, and usually requires a significant learning curve. This is even harder for those who are not familiar with compiler technology. Therefore, our motivation is to simplify this path for other researchers (experts in their domain) with support tools (our tool is CINCLE) to create high-level and productive DSLs through powerful and aggressive source-to-source transformations. 
In fact, parallel programmers can use their expertise without having to design and implement low-level code. The main goal of this thesis was to create a DSL and support tools for high-level stream parallelism in the context of a programming framework that is compiler-based and domain-oriented. Thus, we implemented SPar using CINCLE. SPar supports the software developer with productivity, performance, and code portability while CINCLE provides sufficient support to generate new DSLs. Also, SPar targets source-to-source transformation producing parallel pattern code built on top of FastFlow and MPI. Finally, we provide a full set of experiments showing that SPar provides better coding productivity without significant performance degradation in multi-core systems as well as transformation rules that are able to achieve code portability (for cluster architectures) through its generalized attributes.}, keywords = {}, pubstate = {published}, tppubtype = {phdthesis} } Stream-based systems are representative of several application domains including video, audio, networking, graphic processing, etc. Stream programs may run on different kinds of parallel architectures (desktop, servers, cell phones, and supercomputers) and represent significant workloads on our current computing systems. Nevertheless, most of them are still not parallelized. Moreover, when new software has to be developed, programmers often face a trade-off between coding productivity, code portability, and performance. To solve this problem, we provide a new Domain-Specific Language (DSL) that naturally/on-the-fly captures and represents parallelism for stream-based applications. The aim is to offer a set of attributes (through annotations) that preserves the program's source code and is not architecture-dependent for annotating parallelism. We used the C++ attribute mechanism to design a ``de-facto'' standard C++ embedded DSL named SPar. 
However, the implementation of DSLs using compiler-based tools is difficult, complicated, and usually requires a significant learning curve. This is even harder for those who are not familiar with compiler technology. Therefore, our motivation is to simplify this path for other researchers (experts in their domain) with support tools (our tool is CINCLE) to create high-level and productive DSLs through powerful and aggressive source-to-source transformations. In fact, parallel programmers can use their expertise without having to design and implement low-level code. The main goal of this thesis was to create a DSL and support tools for high-level stream parallelism in the context of a programming framework that is compiler-based and domain-oriented. Thus, we implemented SPar using CINCLE. SPar supports the software developer with productivity, performance, and code portability while CINCLE provides sufficient support to generate new DSLs. Also, SPar targets source-to-source transformation producing parallel pattern code built on top of FastFlow and MPI. Finally, we provide a full set of experiments showing that SPar provides better coding productivity without significant performance degradation in multi-core systems as well as transformation rules that are able to achieve code portability (for cluster architectures) through its generalized attributes. | |
2015 |
|
Griebler, Dalvan; Danelutto, Marco; Torquati, Massimo; Fernandes, Luiz G An Embedded C++ Domain-Specific Language for Stream Parallelism Inproceedings Parallel Computing: On the Road to Exascale, Proceedings of the International Conference on Parallel Computing, pp. 317-326, IOS Press, Edinburgh, Scotland, UK, 2015, ISBN: 978-1-61499-621-7. @inproceedings{GRIEBLER:PARCO:15, title = {An Embedded C++ Domain-Specific Language for Stream Parallelism}, author = {Dalvan Griebler and Marco Danelutto and Massimo Torquati and Luiz G Fernandes}, url = {http://dx.doi.org/10.3233/978-1-61499-621-7-317}, doi = {10.3233/978-1-61499-621-7-317}, isbn = {978-1-61499-621-7}, year = {2015}, date = {2015-09-01}, booktitle = {Parallel Computing: On the Road to Exascale, Proceedings of the International Conference on Parallel Computing}, pages = {317-326}, publisher = {IOS Press}, address = {Edinburgh, Scotland, UK}, series = {ParCo'15}, abstract = {This paper proposes a new C++ embedded Domain-Specific Language (DSL) for expressing stream parallelism by using standard C++11 attributes annotations. The main goal is to introduce high-level parallel abstractions for developing stream based parallel programs as well as reducing sequential source code rewriting. We demonstrated that by using a small set of attributes it is possible to produce different parallel versions depending on the way the source code is annotated. The performances of the parallel code produced are comparable with those obtained by manual parallelization.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } This paper proposes a new C++ embedded Domain-Specific Language (DSL) for expressing stream parallelism by using standard C++11 attributes annotations. The main goal is to introduce high-level parallel abstractions for developing stream based parallel programs as well as reducing sequential source code rewriting. 
We demonstrated that by using a small set of attributes it is possible to produce different parallel versions depending on the way the source code is annotated. The performances of the parallel code produced are comparable with those obtained by manual parallelization. |