From f4db66967b4d20ff57c8fa8d19f3af0a22bc8a8e Mon Sep 17 00:00:00 2001 From: "Dinei A. Rockenbach" Date: Fri, 19 May 2023 20:48:15 +0000 Subject: [PATCH] GSParLib source and examples --- .github/workflows/gspar-build-ci.yml | 26 + .gitignore | 40 + LICENSE | 21 + Makefile | 151 + README.md | 24 +- examples/driver_api/atomic_operations.cpp | 113 + examples/driver_api/gpuinfo.cpp | 93 + examples/driver_api/matrix_multi.cpp | 143 + examples/driver_api/shared_memory.cpp | 126 + examples/driver_api/vector_sum.cpp | 135 + .../driver_api/vector_sum_chunked_memory.cpp | 152 + examples/include/cudabase.hpp | 49 + examples/include/cudabase_driver.hpp | 56 + examples/include/cudabase_nvrtc.hpp | 57 + examples/include/oclbase.h | 116 + examples/include/oclbase.hpp | 116 + examples/pattern_api/mandel.cpp | 141 + .../pattern_api/mandel_batched_parameters.cpp | 222 ++ examples/pattern_api/mandel_stream.cpp | 218 ++ examples/pattern_api/matrix_multi_map_cm.cpp | 133 + examples/pattern_api/matrix_multi_map_rm.cpp | 132 + examples/pattern_api/raytracer.cpp | 818 ++++++ examples/pattern_api/reduce_sample.cpp | 55 + examples/pattern_api/vector_sum_map.cpp | 107 + examples/pattern_api/vector_sum_map_batch.cpp | 117 + .../vector_sum_map_managing_memory.cpp | 103 + .../pattern_api/vector_sum_map_parallel.cpp | 195 ++ examples/pattern_api/vector_sum_mapreduce.cpp | 135 + examples/sequential/mandel.cpp | 100 + examples/sequential/matrix_multi_cm.cpp | 101 + examples/sequential/matrix_multi_rm.cpp | 95 + examples/sequential/primer.cpp | 79 + examples/sequential/raytracer.cpp | 524 ++++ examples/sequential/reduce.cpp | 79 + examples/sequential/saxpy.cpp | 96 + examples/sequential/vector_sum.cpp | 95 + examples/workloads/raytracer_scene.xml | 137 + src/GSPar.hpp | 13 + src/GSPar_Base.cpp | 28 + src/GSPar_Base.hpp | 37 + src/GSPar_BaseGPUDriver.hpp | 796 +++++ src/GSPar_BaseParallelPattern.hpp | 1129 +++++++ src/GSPar_CUDA.cpp | 942 ++++++ src/GSPar_CUDA.hpp | 262 ++ src/GSPar_OpenCL.cpp | 1051 +++++++ src/GSPar_OpenCL.hpp | 260 ++ src/GSPar_PatternComposition.hpp | 271 ++ src/GSPar_PatternMap.hpp | 29 + src/GSPar_PatternReduce.cpp | 155 + src/GSPar_PatternReduce.hpp | 174 ++ thirdpt/marX2/marX2.c | 434 +++ thirdpt/marX2/marX2.h | 29 + thirdpt/rapidxml-1.13/license.txt | 52 + thirdpt/rapidxml-1.13/manual.html | 406 +++ thirdpt/rapidxml-1.13/rapidxml.hpp | 2596 +++++++++++++++++ thirdpt/rapidxml-1.13/rapidxml_iterators.hpp | 174 ++ thirdpt/rapidxml-1.13/rapidxml_print.hpp | 421 +++ thirdpt/rapidxml-1.13/rapidxml_utils.hpp | 122 + 58 files changed, 14480 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/gspar-build-ci.yml create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 examples/driver_api/atomic_operations.cpp create mode 100644 examples/driver_api/gpuinfo.cpp create mode 100644 examples/driver_api/matrix_multi.cpp create mode 100644 examples/driver_api/shared_memory.cpp create mode 100644 examples/driver_api/vector_sum.cpp create mode 100644 examples/driver_api/vector_sum_chunked_memory.cpp create mode 100644 examples/include/cudabase.hpp create mode 100644 examples/include/cudabase_driver.hpp create mode 100644 examples/include/cudabase_nvrtc.hpp create mode 100644 examples/include/oclbase.h create mode 100644 examples/include/oclbase.hpp create mode 100644 examples/pattern_api/mandel.cpp create mode 100644 examples/pattern_api/mandel_batched_parameters.cpp create mode 100644 examples/pattern_api/mandel_stream.cpp create mode 100644 
examples/pattern_api/matrix_multi_map_cm.cpp create mode 100644 examples/pattern_api/matrix_multi_map_rm.cpp create mode 100644 examples/pattern_api/raytracer.cpp create mode 100644 examples/pattern_api/reduce_sample.cpp create mode 100644 examples/pattern_api/vector_sum_map.cpp create mode 100644 examples/pattern_api/vector_sum_map_batch.cpp create mode 100644 examples/pattern_api/vector_sum_map_managing_memory.cpp create mode 100644 examples/pattern_api/vector_sum_map_parallel.cpp create mode 100644 examples/pattern_api/vector_sum_mapreduce.cpp create mode 100644 examples/sequential/mandel.cpp create mode 100644 examples/sequential/matrix_multi_cm.cpp create mode 100644 examples/sequential/matrix_multi_rm.cpp create mode 100644 examples/sequential/primer.cpp create mode 100644 examples/sequential/raytracer.cpp create mode 100644 examples/sequential/reduce.cpp create mode 100644 examples/sequential/saxpy.cpp create mode 100644 examples/sequential/vector_sum.cpp create mode 100644 examples/workloads/raytracer_scene.xml create mode 100644 src/GSPar.hpp create mode 100644 src/GSPar_Base.cpp create mode 100644 src/GSPar_Base.hpp create mode 100644 src/GSPar_BaseGPUDriver.hpp create mode 100644 src/GSPar_BaseParallelPattern.hpp create mode 100644 src/GSPar_CUDA.cpp create mode 100644 src/GSPar_CUDA.hpp create mode 100644 src/GSPar_OpenCL.cpp create mode 100644 src/GSPar_OpenCL.hpp create mode 100644 src/GSPar_PatternComposition.hpp create mode 100644 src/GSPar_PatternMap.hpp create mode 100644 src/GSPar_PatternReduce.cpp create mode 100644 src/GSPar_PatternReduce.hpp create mode 100644 thirdpt/marX2/marX2.c create mode 100644 thirdpt/marX2/marX2.h create mode 100644 thirdpt/rapidxml-1.13/license.txt create mode 100644 thirdpt/rapidxml-1.13/manual.html create mode 100644 thirdpt/rapidxml-1.13/rapidxml.hpp create mode 100644 thirdpt/rapidxml-1.13/rapidxml_iterators.hpp create mode 100644 thirdpt/rapidxml-1.13/rapidxml_print.hpp create mode 100644 thirdpt/rapidxml-1.13/rapidxml_utils.hpp diff --git a/.github/workflows/gspar-build-ci.yml b/.github/workflows/gspar-build-ci.yml new file mode 100644 index 0000000..0d3fb6a --- /dev/null +++ b/.github/workflows/gspar-build-ci.yml @@ -0,0 +1,26 @@ +name: GSPar Build CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v2 + - name: Fix apt on GitHub Actions + run: sudo gem install apt-spy2 && sudo apt-spy2 fix --commit --launchpad --country=US + - name: Update apt + run: sudo apt-get update + - name: Install OpenCL + run: sudo apt-get -o Acquire::Retries=3 install opencl-headers nvidia-opencl-dev #nvidia-libopencl1-384 + - name: Install CUDA + run: sudo apt-get -o Acquire::Retries=3 install nvidia-cuda-dev + - name: Build library + run: make diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b8106b1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,40 @@ +# VS Code +.vscode/ipch/* + +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Ignore the build and lib dirs +build +lib/* + +# Executables +bin/* + +# Auto-generated +generated_*.cpp diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0a03df8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 
Parallel Applications Modelling Group - GMAP + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f64422b --- /dev/null +++ b/Makefile @@ -0,0 +1,151 @@ +# Compilers +COMPILER := g++ +# Directories +SRCDIR := src +BUILDDIR := build +TARGETDIR := bin +EXAMPLESDIR := examples +EXAMPLEDRIVERAPIDIR := $(EXAMPLESDIR)/driver_api +EXAMPLEPATTERNAPIDIR := $(EXAMPLESDIR)/pattern_api +EXAMPLESEQUENTIALDIR := $(EXAMPLESDIR)/sequential +THIRDPTDIR := thirdpt +MARX2DIR := $(THIRDPTDIR)/marX2 +LIBMARX2PATH := $(MARX2DIR)/libmarX2.a +# Names +LIBNAME := gspar +GSPARNAME := gspar +CUDANAME := cuda +OCLNAME := opencl +DRIVERAPINAME := driverapi +PATTERNAPINAME := patternapi +SEQUENTIALNAME := seq +# App names +MANDELNAME := mandel +LANEDETECTIONNAME := lanedetection +# Target +TARGET := $(TARGETDIR)/lib$(LIBNAME).so +# Others +SPACE := +SPACE += +SRCEXT := cpp +EXAMPLESTARGETPREFIX := ex + +# Files +SOURCES := $(wildcard $(SRCDIR)/*.$(SRCEXT)) +OBJECTS := $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(SOURCES:.$(SRCEXT)=.o)) + +CFLAGS := -Wall -std=c++14 -O3 -Wno-reorder +LIB := -Llib -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/targets/x86_64-linux/lib/stubs -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 +LIBOCL := -lOpenCL +LIBCUDADRIVER := -lcuda +LIBCUDANVRTC := -lnvrtc +LIBPTHREAD := -pthread +PATHSLIB := -I/usr/local/cuda/include -Isrc +PATHSTEST := $(PATHSLIB) -I$(THIRDPTDIR) -I$(EXAMPLESDIR)/include +TESTLIB := -L$(TARGETDIR) -l$(LIBNAME) + +PATHOPENCV := -I/usr/local/include/opencv4 +LIBSOPENCV := -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs +EXTRADEPS := +INCMANDEL := +LIBMANDEL := +ifdef DEBUG + DEFS +=-DDEBUG -DGSPAR_DEBUG + EXTRADEPS := $(LIBMARX2PATH) + INCMANDEL := -I$(MARX2DIR) -L$(MARX2DIR) + LIBMANDEL := -lmarX2 -lX11 -lm +endif + +CLR_BLUE := \033[0;34m +CLR_ORANGE := \033[0;33m +CLR_DARKCYAN := \033[0;36m +CLR_NO := \033[0m + +# Functions +get_paths_mandel = $(if $(findstring $(MANDELNAME), $(1)), $(INCMANDEL)) +get_paths_lanedetection = $(if $(findstring $(LANEDETECTIONNAME), $(1)), $(PATHOPENCV)) +get_paths = $(strip $(PATHSTEST) $(LIB) $(call get_paths_mandel, $(1)) $(call get_paths_lanedetection, $(1)) ) + +get_libs_mandel = $(if $(findstring $(MANDELNAME), $(1)), $(LIBMANDEL)) +get_libs_lanedetection = $(if $(findstring $(LANEDETECTIONNAME), $(1)), $(LIBSOPENCV)) +get_libs = $(strip $(call get_libs_mandel, $(1)) $(call get_libs_lanedetection, $(1))) + +# Driver API examples +EXAMPLESOURCES_DRIVERAPI := $(wildcard 
$(EXAMPLEDRIVERAPIDIR)/*.$(SRCEXT)) +EXAMPLETARGETS_DRIVERAPI_CUDA := $(patsubst $(EXAMPLEDRIVERAPIDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%,$(EXAMPLESOURCES_DRIVERAPI:.$(SRCEXT)=_$(CUDANAME))) +EXAMPLETARGETS_DRIVERAPI_OPENCL := $(patsubst $(EXAMPLEDRIVERAPIDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%,$(EXAMPLESOURCES_DRIVERAPI:.$(SRCEXT)=_$(OCLNAME))) +# Pattern API examples +EXAMPLESOURCES_PATTERNAPI := $(wildcard $(EXAMPLEPATTERNAPIDIR)/*.$(SRCEXT)) +EXAMPLETARGETS_PATTERNAPI_CUDA := $(patsubst $(EXAMPLEPATTERNAPIDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%,$(EXAMPLESOURCES_PATTERNAPI:.$(SRCEXT)=_$(CUDANAME))) +EXAMPLETARGETS_PATTERNAPI_OPENCL := $(patsubst $(EXAMPLEPATTERNAPIDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%,$(EXAMPLESOURCES_PATTERNAPI:.$(SRCEXT)=_$(OCLNAME))) +# Sequential examples +EXAMPLESOURCES_SEQUENTIAL := $(wildcard $(EXAMPLESEQUENTIALDIR)/*.$(SRCEXT)) +EXAMPLETARGETS_SEQUENTIAL := $(patsubst $(EXAMPLESEQUENTIALDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(SEQUENTIALNAME)_%,$(EXAMPLESOURCES_SEQUENTIAL:.$(SRCEXT)=)) + + +# Build targets + +$(TARGET): $(OBJECTS) | $(TARGETDIR) + @echo "${CLR_DARKCYAN}Linking dynamic library ${CLR_ORANGE}$(TARGET)${CLR_NO}..." + $(COMPILER) $(DEFS) -shared -fPIC -o $(TARGET) $^ $(LIB) $(LIBOCL) $(LIBCUDADRIVER) $(LIBCUDANVRTC) + +$(BUILDDIR)/%.o: $(SRCDIR)/%.$(SRCEXT) | $(BUILDDIR) + @echo "${CLR_DARKCYAN}Compiling and assembling object ${CLR_ORANGE}$@${CLR_NO}..." + $(COMPILER) $(DEFS) $(CFLAGS) $(PATHSLIB) -c -fPIC -o $@ $< + +$(TARGETDIR): + @mkdir -p $@ + +$(BUILDDIR): + @mkdir -p $@ + + +.PHONY: examples +examples: examples_driver_api examples_pattern_api examples_sequential + +# Driver API examples +examples_driver_api: $(EXAMPLETARGETS_DRIVERAPI_CUDA) $(EXAMPLETARGETS_DRIVERAPI_OPENCL) +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%: $(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%_$(CUDANAME) $(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%_$(OCLNAME) ; +# Lib to CUDA +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%_$(CUDANAME): $(EXAMPLEDRIVERAPIDIR)/%.$(SRCEXT) $(TARGET) $(EXTRADEPS) | $(TARGETDIR) + @echo "${CLR_DARKCYAN}Building GSPar Driver API example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}" + $(COMPILER) $(DEFS) -DGSPARDRIVER_CUDA $(CFLAGS) $< $(call get_paths, $<) $(TESTLIB) -o $@ $(LIBPTHREAD) $(call get_libs, $<) +# Lib to OpenCL +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%_$(OCLNAME): $(EXAMPLEDRIVERAPIDIR)/%.$(SRCEXT) $(TARGET) $(EXTRADEPS) | $(TARGETDIR) + @echo "${CLR_DARKCYAN}Building GSPar Driver API example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}" + $(COMPILER) $(DEFS) -DGSPARDRIVER_OPENCL $(CFLAGS) $< $(call get_paths, $<) $(TESTLIB) -o $@ $(LIBPTHREAD) $(call get_libs, $<) + +# Pattern API examples +examples_pattern_api: $(EXAMPLETARGETS_PATTERNAPI_CUDA) $(EXAMPLETARGETS_PATTERNAPI_OPENCL) +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%: $(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%_$(CUDANAME) $(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%_$(OCLNAME) ; +# Lib to CUDA +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%_$(CUDANAME): $(EXAMPLEPATTERNAPIDIR)/%.$(SRCEXT) $(TARGET) $(EXTRADEPS) | $(TARGETDIR) + @echo "${CLR_DARKCYAN}Building GSPar Pattern API example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}" + $(COMPILER) $(DEFS) -DGSPARDRIVER_CUDA $(CFLAGS) $< $(call get_paths, $<) $(TESTLIB) -o $@ $(LIBPTHREAD) $(call get_libs, $<) +# 
Lib to OpenCL
+$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%_$(OCLNAME): $(EXAMPLEPATTERNAPIDIR)/%.$(SRCEXT) $(TARGET) $(EXTRADEPS) | $(TARGETDIR)
+	@echo "${CLR_DARKCYAN}Building GSPar Pattern API example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}"
+	$(COMPILER) $(DEFS) -DGSPARDRIVER_OPENCL $(CFLAGS) $< $(call get_paths, $<) $(TESTLIB) -o $@ $(LIBPTHREAD) $(call get_libs, $<)
+
+# Sequential examples
+examples_sequential: $(EXAMPLETARGETS_SEQUENTIAL)
+$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(SEQUENTIALNAME)_%: $(EXAMPLESEQUENTIALDIR)/%.$(SRCEXT) | $(TARGETDIR)
+	@echo "${CLR_DARKCYAN}Building sequential example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}"
+	$(COMPILER) $(DEFS) $(CFLAGS) $< $(call get_paths, $<) -o $@ $(call get_libs, $<)
+
+$(LIBMARX2PATH): $(MARX2DIR)/marX2.c $(MARX2DIR)/marX2.h
+	@echo "${CLR_DARKCYAN}Building ${CLR_ORANGE}$(LIBMARX2PATH)${CLR_DARKCYAN}${CLR_NO}"
+	gcc -c -Wall -O3 -I/usr/X11R6/include -I$(MARX2DIR) $(MARX2DIR)/marX2.c -o $(MARX2DIR)/marX2.o
+	ar -rv $(LIBMARX2PATH) $(MARX2DIR)/marX2.o
+	ranlib $(LIBMARX2PATH)
+
+.PHONY: clean
+clean:
+	@echo "${CLR_DARKCYAN}Cleaning...${CLR_NO}";
+	$(RM) -r $(BUILDDIR) $(TARGETDIR) $(MARX2DIR)/*.a $(MARX2DIR)/*.o
+
+clean_lib:
+	@echo "${CLR_DARKCYAN}Cleaning lib $(TARGET)...${CLR_NO}";
+	$(RM) $(OBJECTS) $(TARGET)
+
+all: $(TARGET) examples
diff --git a/README.md b/README.md
index 73de64e..bcd1726 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,23 @@
-# GSParLib
\ No newline at end of file
+# GSParLib
+
+GSParLib is a C++ object-oriented multi-level API for GPU programming that allows code portability between different GPU platforms and targets stream and data parallelism.
+
+The scientific article presenting GSParLib is currently under review.
+
+## Compilation
+
+- `make` builds the library
+- `make examples` builds the examples for both the Pattern and Driver APIs, as well as the sequential versions. To compile just a specific set of examples, use one of:
+  - `make examples_driver_api`
+  - `make examples_pattern_api`
+  - `make examples_sequential`
+- Alternatively, it is possible to compile individual examples by referring directly to their compiled names (the `cuda`/`opencl` suffix may be omitted). Ex.: `make bin/ex_driverapi_gpuinfo` compiles both CUDA and OpenCL versions of the [gpuinfo.cpp](examples/driver_api/gpuinfo.cpp) example.
+
+To compile with debugging enabled, use `DEBUG=1 make` (both when compiling the library and the examples). This automatically enables debugging flags, so that GSParLib prints various debugging information during execution.
+
+## Run examples
+
+After building the library, it is necessary to make it available at runtime.
+To do this, execute `export LD_LIBRARY_PATH=<repo_root>/bin:$LD_LIBRARY_PATH`, replacing `<repo_root>` with the path to the repository's root folder.
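+For instance, assuming the repository was cloned to `$HOME/GSParLib` (a hypothetical location used here only for illustration), the export could look like this:
+
+```sh
+# Hypothetical clone path; adjust to wherever the repository actually lives.
+export LD_LIBRARY_PATH=$HOME/GSParLib/bin:$LD_LIBRARY_PATH
+```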
+ +After this, just execute any example under the path `bin/ex_` diff --git a/examples/driver_api/atomic_operations.cpp b/examples/driver_api/atomic_operations.cpp new file mode 100644 index 0000000..679cd77 --- /dev/null +++ b/examples/driver_api/atomic_operations.cpp @@ -0,0 +1,113 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +const char* kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_KERNEL void atomicops_kernel(const int max, + GSPAR_DEVICE_GLOBAL_MEMORY const int *vector, + GSPAR_DEVICE_GLOBAL_MEMORY int *result) { + size_t gid = gspar_get_global_id(0); + if (gid <= max) { + gspar_atomic_add_int(result, vector[gid]); + } + } +); + +void print_vector(int size, const int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." << vector[size-1]; + } else { + for (int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + + std::cout << "Testing atomic operations in GSParLib Driver API" << std::endl; + + const int VECTOR_SIZE = 20; + + // Create memory objects + int correctResult = 0; + int* result = new int; + int* vector = new int[VECTOR_SIZE]; + for (int i = 0; i < VECTOR_SIZE; i++) { + vector[i] = (int)i; + correctResult += i; + } + + std::cout << "Vector with " << VECTOR_SIZE << " elements:" << std::endl; + print_vector(VECTOR_SIZE, vector); + + try { + + auto t_start = std::chrono::steady_clock::now(); + + auto driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + // Get the first GPU + auto gpu = driver->getGpu(0); + + auto vector_dev = gpu->malloc(sizeof(int) * VECTOR_SIZE, vector); + // Async copy + // vector_dev->copyInAsync(); + // vector_dev->waitAsync(); + // Sync copy + vector_dev->copyIn(); + + auto result_dev = gpu->malloc(sizeof(int), result); + + auto kernel = gpu->prepareKernel(kernelSource, "atomicops_kernel"); + // auto kernel = new Kernel(gpu, kernelSource, "atomicops_kernel"); + + // Set a fixed number of threads per block for the X dimension + // kernel->setNumThreadsPerBlockForX(5); + kernel->setParameter(sizeof(VECTOR_SIZE), &VECTOR_SIZE); + kernel->setParameter(vector_dev); + kernel->setParameter(result_dev); + + kernel->runAsync({VECTOR_SIZE, 0}); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete vector_dev; + delete result_dev; + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Expected result: " << correctResult << std::endl; + std::cout << "Actual result: " << *result << std::endl; + + delete vector; + delete result; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +} diff --git a/examples/driver_api/gpuinfo.cpp b/examples/driver_api/gpuinfo.cpp new file mode 100644 index 0000000..184f860 --- /dev/null +++ b/examples/driver_api/gpuinfo.cpp @@ -0,0 +1,93 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + + const char* nameOfGSParDriver = "OpenCL"; + + #include "GSPar_OpenCL.hpp" + using namespace 
GSPar::Driver::OpenCL; + +#else + + const char* nameOfGSParDriver = "CUDA"; + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +#endif + +const char* kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_MACRO_BEGIN CONSTANT_N 42 GSPAR_DEVICE_MACRO_END + GSPAR_DEVICE_KERNEL void info_kernel(int N) { + unsigned int idx_x = gspar_get_global_id(0); + unsigned int idx_y = gspar_get_global_id(1); + unsigned int blk_x = gspar_get_block_size(0); + unsigned int blk_y = gspar_get_block_size(1); + unsigned int blkid_x = gspar_get_block_id(0); + unsigned int blkid_y = gspar_get_block_id(1); + unsigned int thr_x = gspar_get_thread_id(0); + unsigned int thr_y = gspar_get_thread_id(1); + gspar_synchronize_local_threads(); // Unnecessary, just for show + printf("Thread [%u,%u]: Dim (%u, %u), Block (%u, %u), thread (%u, %u), constant N: %d, parameter N: %d\n", + idx_x, idx_y, blk_x, blk_y, blkid_x, blkid_y, thr_x, thr_y, CONSTANT_N, N); + } +); + +int main(int argc, const char * argv[]) { + + std::cout << "Testing GSPar Driver: " << nameOfGSParDriver << std::endl; + + try { + + auto t_start = std::chrono::steady_clock::now(); + + Instance* driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + auto gpus = driver->getGpuList(); + + std::cout << "Found " << numGpus << " GPU devices:" << std::endl; + int d = 0; + for (auto const& gpu : gpus) { + std::cout << "Device #" << ++d << ": \"" << gpu->getName() << "\""; + std::cout << " (" << (gpu->isIntegratedMainMemory() ? "integrated" : "dedicated") << ")" << std::endl; + std::cout << " Memory:" << std::endl; + std::cout << " Total global memory: " << gpu->getGlobalMemorySizeBytes()/(1024 * 1024) << " MB" << std::endl; + std::cout << " Total local memory: " << gpu->getLocalMemorySizeBytes()/1024 << " KB" << std::endl; + std::cout << " Total shared memory per CU: " << gpu->getSharedMemoryPerComputeUnitSizeBytes()/1024 << " KB" << std::endl; + std::cout << " Number of compute units (CU): " << gpu->getComputeUnitsCount() << std::endl; + std::cout << " Maximum threads per block: " << gpu->getMaxThreadsPerBlock() << std::endl; + std::cout << " Device clock rate: " << gpu->getClockRateMHz() << " MHz" << std::endl; + } + + auto gpu = gpus.front(); + std::cout << "Running test kernel in the first GPU (" << gpu->getName() << ")" << std::endl; + + auto kernel = gpu->prepareKernel(kernelSource, "info_kernel"); + // auto kernel = new Kernel(gpu, kernelSource, "info_kernel"); + + int N = 12; + kernel->setParameter(sizeof(N), &N); + + kernel->runAsync({2, 3}); + // kernel->waitAsync(); + + delete kernel; + + auto t_end = std::chrono::steady_clock::now(); + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + } + + return 0; +} diff --git a/examples/driver_api/matrix_multi.cpp b/examples/driver_api/matrix_multi.cpp new file mode 100644 index 0000000..aff2bc5 --- /dev/null +++ b/examples/driver_api/matrix_multi.cpp @@ -0,0 +1,143 @@ +#include +#include + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#endif + +const char* 
kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_KERNEL void matrix_multi(long MX, + GSPAR_DEVICE_GLOBAL_MEMORY const long *a, + GSPAR_DEVICE_GLOBAL_MEMORY const long *b, + GSPAR_DEVICE_GLOBAL_MEMORY long *result) { + long i = gspar_get_global_id(0); + long j = gspar_get_global_id(1); + if (i < MX && j < MX) { + for (long k = 0; kinit(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cerr << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + auto gpus = driver->getGpuList(); + + // Get the first GPU + Device* gpu = gpus.front(); + MemoryObject* a_dev = gpu->malloc(sizeof(long) * max * max, a); + MemoryObject* b_dev = gpu->malloc(sizeof(long) * max * max, b); + // Async copy + // a_dev->copyInAsync(); + // b_dev->copyInAsync(); + // AsyncExecutionSupport::waitAllAsync({ a_dev->getBaseAsyncObject(), b_dev->getBaseAsyncObject() }); + // Sync copy + a_dev->copyIn(); + b_dev->copyIn(); + + MemoryObject* result_dev = gpu->malloc(sizeof(long) * max * max, result); + result_dev->copyIn(); + + // Kernel* kernel = gpu->prepareKernel(kernelSource, "matrix_multi"); + Kernel* kernel = new Kernel(gpu, kernelSource, "matrix_multi"); + + kernel->setParameter(sizeof(max), &max); + kernel->setParameter(a_dev); + kernel->setParameter(b_dev); + kernel->setParameter(result_dev); + + unsigned long dimensions[3] = {(unsigned long)max, (unsigned long)max, 0}; + kernel->runAsync(dimensions); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete a_dev; + delete b_dev; + delete result_dev; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_matrix(long max, const long* matrix, bool compact = false) { + if (compact || max > 100) { + std::cout << matrix[0] << "..." 
<< matrix[(max * max)-1]; + } else { + for (long i = 0; i < max; i++) { + std::cout << std::endl; + for (long j = 0; j < max; j++) { + std::cout << matrix[i * max + j] << " "; + } + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const long MX = std::stoi(argv[1]); + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + + // Create memory objects + long* matrix_a = new long[MX * MX]; + long* matrix_b = new long[MX * MX]; + long* result = new long[MX * MX]; + for (long i = 0; i < MX; i++) { + for (long j = 0; j < MX; j++) { + matrix_a[j * MX + i] = 4; + matrix_b[j * MX + i] = 5; + result[j * MX + i] = 0; + } + } + + std::cout << "Matrix A: "; + print_matrix(MX, matrix_a, true); + std::cout << "Matrix B: "; + print_matrix(MX, matrix_b, true); + + auto t_start = std::chrono::steady_clock::now(); + + matrix_multi(MX, matrix_a, matrix_b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_matrix(MX, result); + + delete matrix_a; + delete matrix_b; + delete result; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/driver_api/shared_memory.cpp b/examples/driver_api/shared_memory.cpp new file mode 100644 index 0000000..2f434e5 --- /dev/null +++ b/examples/driver_api/shared_memory.cpp @@ -0,0 +1,126 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + + std::cout << "Testing shared memory in GSParLib Driver API" << std::endl; + + std::string kernelSource = "" + "GSPAR_DEVICE_KERNEL void sharedmem_kernel(const int max, \n" + " GSPAR_DEVICE_GLOBAL_MEMORY const unsigned int *vector, \n" + " GSPAR_DEVICE_GLOBAL_MEMORY unsigned int *result"; + #ifdef GSPARDRIVER_OPENCL // OpenCL requires declaring shared memory after all the parameters + kernelSource += ", GSPAR_DEVICE_SHARED_MEMORY unsigned int* sharedMem) { \n"; + #else // CUDA requires declaring shared memory inside kernel's body + kernelSource += ") { \n GSPAR_DEVICE_SHARED_MEMORY unsigned int sharedMem[];\n"; + #endif + kernelSource += + " size_t gid = gspar_get_global_id(0); \n" + " if (gid <= max) { \n" + " sharedMem[gid] = vector[gid]; \n" + " } \n" + " gspar_synchronize_local_threads(); \n" + " if (gid == 0) { \n" + " for (size_t i = 0; i < max; i++) { \n" + " *result += sharedMem[i]; \n" + " } \n" + " } \n" + "} \n"; + + const unsigned int VECTOR_SIZE = 20; + + // Create memory objects + unsigned int correctResult = 0; + unsigned int* result = new unsigned int; + unsigned int* vector = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + vector[i] = (unsigned int)i; + correctResult += i; + } + + std::cout << "Vector with " << VECTOR_SIZE << " elements:" << std::endl; + print_vector(VECTOR_SIZE, vector); + + try { + + auto t_start = std::chrono::steady_clock::now(); + + auto driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + // Get the first GPU + auto gpu = driver->getGpu(0); + + auto vector_dev = gpu->malloc(sizeof(unsigned int) * VECTOR_SIZE, vector); + // Async copy + // vector_dev->copyInAsync(); + // vector_dev->waitAsync(); + // Sync copy + vector_dev->copyIn(); + + auto result_dev = gpu->malloc(sizeof(unsigned int), result); + + auto kernel = gpu->prepareKernel(kernelSource, "sharedmem_kernel"); + // auto kernel = new Kernel(gpu, kernelSource, "sharedmem_kernel"); + + kernel->setSharedMemoryAllocation(sizeof(unsigned int) * VECTOR_SIZE); + + // Set a fixed number of threads per block for the X dimension + // kernel->setNumThreadsPerBlockForX(5); + kernel->setParameter(sizeof(VECTOR_SIZE), &VECTOR_SIZE); + kernel->setParameter(vector_dev); + kernel->setParameter(result_dev); + + kernel->runAsync({VECTOR_SIZE, 0}); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete vector_dev; + delete result_dev; + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Expected result: " << correctResult << std::endl; + std::cout << "Actual result: " << *result << std::endl; + + delete vector; + delete result; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +} diff --git a/examples/driver_api/vector_sum.cpp b/examples/driver_api/vector_sum.cpp new file mode 100644 index 0000000..17f5b6a --- /dev/null +++ b/examples/driver_api/vector_sum.cpp @@ -0,0 +1,135 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + + #include 
"GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#else + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +#endif + +const char* kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_KERNEL void vector_sum_kernel(const int max, + GSPAR_DEVICE_GLOBAL_MEMORY const unsigned int *a, + GSPAR_DEVICE_GLOBAL_MEMORY const unsigned int *b, + GSPAR_DEVICE_GLOBAL_MEMORY unsigned int *result) { + size_t gid = gspar_get_global_id(0); + if (gid <= max) { + result[gid] = a[gid] + b[gid]; + } + } +); + +void vector_sum(const unsigned int max, const unsigned int* a, const unsigned int* b, unsigned int* result) { + + try { + + auto driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + // Get the first GPU + auto gpu = driver->getGpu(0); + + // MemoryObject* a_dev = new MemoryObject(gpu, sizeof(unsigned int) * max, a); + auto a_dev = gpu->malloc(sizeof(unsigned int) * max, a); + auto b_dev = gpu->malloc(sizeof(unsigned int) * max, b); + // Async copy + a_dev->copyInAsync(); + b_dev->copyInAsync(); + AsyncExecutionSupport::waitAllAsync({ a_dev, b_dev }); + // Sync copy + // a_dev->copyIn(); + // b_dev->copyIn(); + + auto result_dev = gpu->malloc(sizeof(unsigned int) * max, result); + + // auto kernel = new Kernel(gpu, kernelSource, "vector_sum_kernel"); + auto kernel = gpu->prepareKernel(kernelSource, "vector_sum_kernel"); + + // Set a fixed number of threads per block for the X dimension + kernel->setNumThreadsPerBlockForX(5); + kernel->setParameter(sizeof(max), &max); + kernel->setParameter(a_dev); + kernel->setParameter(b_dev); + kernel->setParameter(result_dev); + + kernel->runAsync({max, 0}); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete a_dev; + delete b_dev; + delete result_dev; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + + // Create memory objects + unsigned int* result = new unsigned int[VECTOR_SIZE]; + unsigned int* a = new unsigned int[VECTOR_SIZE]; + unsigned int* b = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + a[i] = (unsigned int)i; + b[i] = (unsigned int)i + 1; + result[i] = 0; + } + + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(VECTOR_SIZE, a, b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); + + delete result; + delete a; + delete b; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/driver_api/vector_sum_chunked_memory.cpp b/examples/driver_api/vector_sum_chunked_memory.cpp new file mode 100644 index 0000000..05f474d --- /dev/null +++ b/examples/driver_api/vector_sum_chunked_memory.cpp @@ -0,0 +1,152 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#else + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +#endif + +const char* kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_KERNEL void vector_sum(const int max, + GSPAR_DEVICE_GLOBAL_MEMORY const float *a, + GSPAR_DEVICE_GLOBAL_MEMORY const float *b, + GSPAR_DEVICE_GLOBAL_MEMORY float *result) { + size_t gid = gspar_get_global_id(0); + if (gid <= max) { + result[gid] = a[gid] + b[gid]; + } + } +); + +void vector_sum(const unsigned int max, const unsigned int chunks, const float* a, const float* b, float* result) { + + try { + + Instance* driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + // Get the first GPU + auto gpu = driver->getGpu(0); + + // Separate memory in chunks to simulate real-world chunked data + const unsigned int itemsInEachChunk = max/chunks; + + const void** a_chunked = new const void*[chunks]; + const void** b_chunked = new const void*[chunks]; + for (unsigned int chunk = 0; chunk < chunks; chunk++) { + a_chunked[chunk] = &a[chunk*itemsInEachChunk]; + b_chunked[chunk] = &b[chunk*itemsInEachChunk]; + // std::cout << "a_chunked[" << chunk << "] starts on " << ((float*)a_chunked[chunk])[0] << std::endl; + // std::cout << "b_chunked[" << chunk << "] starts on " << ((float*)b_chunked[chunk])[0] << std::endl; + } + + ChunkedMemoryObject* a_dev = gpu->mallocChunked(chunks, sizeof(float) * itemsInEachChunk, a_chunked); + ChunkedMemoryObject* b_dev = gpu->mallocChunked(chunks, sizeof(float) * itemsInEachChunk, b_chunked); + + // Async copy + a_dev->copyInAsync(); + b_dev->copyInAsync(); + AsyncExecutionSupport::waitAllAsync({ a_dev, b_dev }); + // Sync copy + // a_dev->copyIn(); + // b_dev->copyIn(); + + MemoryObject* result_dev = gpu->malloc(sizeof(float) * max, result); + + // Kernel* kernel = 
gpu->prepareKernel(kernelSource, "vector_sum"); + Kernel* kernel = new Kernel(gpu, kernelSource, "vector_sum"); + + kernel->setParameter(sizeof(max), &max); + kernel->setParameter(a_dev); + kernel->setParameter(b_dev); + kernel->setParameter(result_dev); + + unsigned long dimensions[3] = {max, 0, 0}; + kernel->runAsync(dimensions); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete a_dev; + delete b_dev; + delete result_dev; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const float* vector, unsigned int itemsInEachChunk = 0, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." << vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + if (itemsInEachChunk && ((i+1) % itemsInEachChunk == 0)) std::cout << "| "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 3) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + std::cerr << " should be divisible by " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + const unsigned int CHUNKS = std::stoi(argv[2]); + + const unsigned int itemsInEachChunk = VECTOR_SIZE/CHUNKS; + + + // Create memory objects + float* result = new float[VECTOR_SIZE]; + float* a = new float[VECTOR_SIZE]; + float* b = new float[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + a[i] = (float)i; + b[i] = (float)(i * 2); + result[i] = 0; + } + + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a, itemsInEachChunk); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b, itemsInEachChunk); + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(VECTOR_SIZE, CHUNKS, a, b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); + + delete result; + delete a; + delete b; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/include/cudabase.hpp b/examples/include/cudabase.hpp new file mode 100644 index 0000000..3782235 --- /dev/null +++ b/examples/include/cudabase.hpp @@ -0,0 +1,49 @@ + +#ifndef __CUDABASE_INCLUDED__ +#define __CUDABASE_INCLUDED__ + + +#define CUDA_ERROR_CHECK + +#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ ) +#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) + +inline void __cudaSafeCall( cudaError err, const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + if ( cudaSuccess != err ) + { + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %d-%s\n", + file, line, err, cudaGetErrorString( err ) ); + exit( -1 ); + } +#endif + + return; +} + +inline void __cudaCheckError( const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + cudaError err = cudaGetLastError(); + if ( cudaSuccess != err ) + { + fprintf( stderr, "cudaCheckError() failed at %s:%i : %d-%s\n", + file, line, err, cudaGetErrorString( err ) ); + exit( -1 ); + } + + // More careful checking. However, this will affect performance. + // Comment away if needed. 
+ err = cudaDeviceSynchronize(); + if( cudaSuccess != err ) + { + fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %d-%s\n", + file, line, err, cudaGetErrorString( err ) ); + exit( -1 ); + } +#endif + + return; +} +#endif \ No newline at end of file diff --git a/examples/include/cudabase_driver.hpp b/examples/include/cudabase_driver.hpp new file mode 100644 index 0000000..2b412cf --- /dev/null +++ b/examples/include/cudabase_driver.hpp @@ -0,0 +1,56 @@ + +#ifndef __CUDABASE_INCLUDED__ +#define __CUDABASE_INCLUDED__ + +#include +#include + +#define CUDA_ERROR_CHECK + +#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ ) +// #define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) + +inline void __cudaSafeCall( CUresult err, const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + if ( CUDA_SUCCESS != err ) + { + const char* errName; + cuGetErrorName(err, &errName); + const char* errString; + cuGetErrorString(err, &errString); + + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s - %s\n", + file, line, errName, errString ); + exit( -1 ); + } +#endif + + return; +} + +// inline void __cudaCheckError( const char *file, const int line ) +// { +// #ifdef CUDA_ERROR_CHECK +// CUresult err = cudaGetLastError(); +// if ( cudaSuccess != err ) +// { +// fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n", +// file, line, cudaGetErrorString( err ) ); +// exit( -1 ); +// } + +// // More careful checking. However, this will affect performance. +// // Comment away if needed. +// err = cudaDeviceSynchronize(); +// if( cudaSuccess != err ) +// { +// fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", +// file, line, cudaGetErrorString( err ) ); +// exit( -1 ); +// } +// #endif + +// return; +// } +#endif \ No newline at end of file diff --git a/examples/include/cudabase_nvrtc.hpp b/examples/include/cudabase_nvrtc.hpp new file mode 100644 index 0000000..deb85d1 --- /dev/null +++ b/examples/include/cudabase_nvrtc.hpp @@ -0,0 +1,57 @@ + +#ifndef __CUDABASENVRTC_INCLUDED__ +#define __CUDABASENVRTC_INCLUDED__ + +#include +#include + +#define CUDA_ERROR_CHECK + +#define NvrtcSafeCall( err ) __nvrtcSafeCall( err, __FILE__, __LINE__ ) +#define NvrtcSafeBuild( err, prog ) __nvrtcSafeBuild( prog, err, __FILE__, __LINE__ ) + +inline void __nvrtcSafeCall( nvrtcResult err, const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + if ( NVRTC_SUCCESS != err ) + { + const char* errString = nvrtcGetErrorString(err); + + fprintf( stderr, "nvrtcSafeCall() failed at %s:%i : %s\n", + file, line, errString ); + exit( -1 ); + } +#endif + + return; +} + +inline void __nvrtcSafeBuild( nvrtcProgram prog, nvrtcResult err, const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + if ( NVRTC_SUCCESS != err ) + { + const char* errString = nvrtcGetErrorString(err); + + fprintf( stderr, "nvrtcSafeBuild() failed at %s:%i : %s\n", + file, line, errString ); + + size_t logSize = 0; + nvrtcGetProgramLogSize(prog, &logSize); + + if (logSize > 0) { + char *buildLog = new char[logSize]; + nvrtcGetProgramLog(prog, buildLog); + + fprintf( stderr, "Build log:\n%s", buildLog); + delete[] buildLog; + } + + exit( -1 ); + } +#endif + + return; +} + +#endif \ No newline at end of file diff --git a/examples/include/oclbase.h b/examples/include/oclbase.h new file mode 100644 index 0000000..2a7d757 --- /dev/null +++ b/examples/include/oclbase.h @@ -0,0 +1,116 @@ + +#ifndef __OCLBASE_INCLUDED__ +#define __OCLBASE_INCLUDED__ + +#include +#include + +#define 
OCL_ERROR_CHECK + +const char *__openCLGetErrorString(cl_int error) +{ +switch(error){ + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } +} + +#define OpenCLCheckError(status) __openCLCheckError( status, __FILE__, __LINE__ ) +#define OpenCLCheckBuildError(status, program, device) __openCLPrintBuildError( status, program, device, __FILE__, __LINE__ ) + +inline void __openCLCheckError( cl_int status, const char 
*file, const int line ) +{ +#ifdef OCL_ERROR_CHECK + if (status != CL_SUCCESS) { + printf("OpenCL failed at %s:%i : %s (%d)\n", file, line, __openCLGetErrorString(status), status); + exit( -1 ); + } +#endif + return; +} + +inline void __openCLPrintBuildError(cl_int status, cl_program program, cl_device_id device, const char *file, const int line) +{ + if (status == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = (char *) malloc(log_size); + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("Program build log:\n"); + printf("%s\n", log); + free(log); + } + + OpenCLCheckError(status); +} + +#endif diff --git a/examples/include/oclbase.hpp b/examples/include/oclbase.hpp new file mode 100644 index 0000000..4e30632 --- /dev/null +++ b/examples/include/oclbase.hpp @@ -0,0 +1,116 @@ + +#ifndef __OCLBASE_INCLUDED__ +#define __OCLBASE_INCLUDED__ + +#include +#include + +#define OCL_ERROR_CHECK + +const char *__openCLGetErrorString(cl_int error) +{ +switch(error){ + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + 
case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } +} + +#define OpenCLCheckError(status) __openCLCheckError( status, __FILE__, __LINE__ ) +#define OpenCLCheckBuildError(status, program, device) __openCLPrintBuildError( status, program, device, __FILE__, __LINE__ ) + +inline void __openCLCheckError( cl_int status, const char *file, const int line ) +{ +#ifdef OCL_ERROR_CHECK + if (status != CL_SUCCESS) { + printf("OpenCL failed at %s:%i : %s (%d)\n", file, line, __openCLGetErrorString(status), status); + exit( -1 ); + } +#endif + return; +} + +inline void __openCLPrintBuildError(cl_int status, cl_program program, cl_device_id device, const char *file, const int line) +{ + if (status == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = (char *) malloc(log_size); + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("Program build log:\n"); + printf("%s\n", log); + free(log); + } + + OpenCLCheckError(status); +} + +#endif diff --git a/examples/pattern_api/mandel.cpp b/examples/pattern_api/mandel.cpp new file mode 100644 index 0000000..c578277 --- /dev/null +++ b/examples/pattern_api/mandel.cpp @@ -0,0 +1,141 @@ +#include +#include +#ifdef DEBUG +#include "marX2/marX2.h" +#endif + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +void mandelbrot(const double init_a, const double init_b, const double range, const unsigned long dim, const unsigned long niter, unsigned char *M) { + double step = range/((double) dim); + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + double im=init_b+(step*i); + double cr; + double a=cr=init_a+step*j; + double b=im; + unsigned long k = 0; + for (k = 0; k < niter; k++) { + double a2=a*a; + double b2=b*b; + if ((a2+b2)>4.0) break; + b=2*a*b+im; + a=a2-b2+cr; + } + M[i*dim+j] = (unsigned char)(255-((k*255/niter))); + )); + + try { + + pattern->setParameter("init_a", init_a) + .setParameter("init_b", init_b) + .setParameter("step", step) + .setParameter("dim", dim) + .setParameter("niter", niter) + .setParameter("M", dim*dim, M, GSPAR_PARAM_OUT); + + pattern->setStdVarNames({"i", "j", ""}); + + pattern->compile({dim, dim, 0}); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + + 
tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + try { + pattern->run(); + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + + delete pattern; +} + +int main(int argc, char **argv) { + double init_a=-2.125,init_b=-1.5,range=3.0; + unsigned long dim = 1000; + unsigned long niter = 1000; + + #ifndef DEBUG + if (argc<3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + exit(-1); + } + #endif + if (argc > 1) { + dim = strtoul(argv[1], 0, 10); + } + if (argc > 2) { + niter = strtoul(argv[2], 0, 10); + } + + unsigned char *M = new unsigned char[dim*dim]; + + #ifdef DEBUG + SetupXWindows(dim,dim,1,NULL,"Mandelbroot"); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + mandelbrot(init_a, init_b, range, dim, niter, M); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + #ifdef DEBUG + for(unsigned long i=0; i(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Teste: " << argv[0] << " " << dim << " " << niter << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << argv[0] << " " << dim << " " << niter << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + #ifdef DEBUG + getchar(); + CloseXWindows(); + #endif + + delete[] M; + return 0; +} diff --git a/examples/pattern_api/mandel_batched_parameters.cpp b/examples/pattern_api/mandel_batched_parameters.cpp new file mode 100644 index 0000000..1950079 --- /dev/null +++ b/examples/pattern_api/mandel_batched_parameters.cpp @@ -0,0 +1,222 @@ +/* *************************************************************************** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU General Public License. 
This exception does not however + * invalidate any other reasons why the executable file might be covered by + * the GNU General Public License. + * + **************************************************************************** + */ + +/* + + Author: Marco Aldinucci. + email: aldinuc@di.unipi.it + marco@pisa.quadrics.com + date : 15/11/97 + +Modified by: + +**************************************************************************** + * Author: Dalvan Griebler + * Author: Dinei Rockenbach + * + * Copyright: GNU General Public License + * Description: This program simply computes the mandelbroat set. + * File Name: mandel.cpp + * Version: 1.0 (25/05/2018) + * Compilation Command: make + **************************************************************************** +*/ + + +#include +#ifdef DEBUG +#include "marX2.h" +#endif +#include +#include + +#include +#include + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +#define DIM 800 +#define ITERATION 1024 + +double diffmsec(struct timeval a, struct timeval b) { + long sec = (a.tv_sec - b.tv_sec); + long usec = (a.tv_usec - b.tv_usec); + + if(usec < 0) { + --sec; + usec += 1000000; + } + return ((double)(sec*1000)+ (double)usec/1000.0); +} + + + +int main(int argc, char **argv) { + double init_a=-2.125,init_b=-1.5,range=3.0; + unsigned long dim = DIM, niter = ITERATION; + // stats + struct timeval t1,t2; + int retries=1; + double avg = 0; + int batch_size = 1; + + if (argc<5) { + printf("Usage: %s size niterations retries batch_size\n\n", argv[0]); + exit(-1); + } + else { + dim = atoi(argv[1]); + niter = atoi(argv[2]); + retries = atoi(argv[3]); + batch_size = atoi(argv[4]); + } + + double * runs = new double[retries]; + unsigned char **Ms = new unsigned char*[batch_size]; + for (int b = 0; b < batch_size; b++) { + Ms[b] = new unsigned char[dim]; + } + + unsigned int batches = ceil((double)dim/batch_size); + + double step = range/((double) dim); + +#ifdef DEBUG + SetupXWindows(dim,dim,1,NULL,"Sequential Mandelbroot"); +#endif + + printf("bin;size;numiter;time (ms);workers;batch size\n"); + for (int r=0; r4.0) break; + b=2*a*b+im; + a=a2-b2+cr; + } + M[j]= (unsigned char) 255-((k*255/niter)); + )); + + unsigned long dimensions[3] = {dim, 0, 0}; + + try { + + pattern->setParameterPlaceholder("i", GSPAR_PARAM_VALUE, GSPAR_PARAM_IN, true) + .setParameter("dim", dim) + .setParameter("init_a", init_a) + .setParameter("init_b", init_b) + .setParameter("step", step) + .setParameter("niter", niter) + .setParameterPlaceholder("M", GSPAR_PARAM_POINTER, GSPAR_PARAM_INOUT, true); + + pattern->setStdVarNames({"j", "", ""}) + .setBatchSize(batch_size); + + pattern->compile(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + // Start time + gettimeofday(&t1,NULL); + + int* Is = new int[batch_size]; + + for(unsigned int b=0; bsetBatchedParameter("i", Is) + .setBatchedParameter("M", dim, Ms, GSPAR_PARAM_INOUT); + + pattern->run(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +#ifdef DEBUG + for (int i = 0; i < batch_size; i++) { + ShowLine(Ms[i],dim,Is[i]); + } +#endif + } + // Stop 
time + gettimeofday(&t2,NULL); + + avg += runs[r] = diffmsec(t2,t1); + printf("%s;%lu;%lu;%.2f;1;1\n", argv[0], dim, niter, runs[r]); + } + avg = avg / (double) retries; + double var = 0; + for (int r=0; r + * Author: Dinei Rockenbach + * + * Copyright: GNU General Public License + * Description: This program simply computes the mandelbroat set. + * File Name: mandel.cpp + * Version: 1.0 (25/05/2018) + * Compilation Command: make + **************************************************************************** +*/ + + +#include +#ifdef DEBUG +#include "marX2/marX2.h" +#endif +#include +#include + +#include +#include + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +#define DIM 800 +#define ITERATION 1024 + +double diffmsec(struct timeval a, struct timeval b) { + long sec = (a.tv_sec - b.tv_sec); + long usec = (a.tv_usec - b.tv_usec); + + if(usec < 0) { + --sec; + usec += 1000000; + } + return ((double)(sec*1000)+ (double)usec/1000.0); +} + + + +int main(int argc, char **argv) { + double init_a=-2.125,init_b=-1.5,range=3.0; + unsigned long dim = DIM, niter = ITERATION; + // stats + struct timeval t1,t2; + int retries=1; + double avg = 0; + + + if (argc<4) { + printf("Usage: %s size niterations retries\n\n", argv[0]); + exit(-1); + } + else { + dim = atoi(argv[1]); + niter = atoi(argv[2]); + retries = atoi(argv[3]); + } + + double * runs = new double[retries]; + unsigned char *M = new unsigned char[dim]; + + double step = range/((double) dim); + +#ifdef DEBUG + SetupXWindows(dim,dim,1,NULL,"Sequential Mandelbroot"); +#endif + + printf("bin;size;numiter;time (ms);workers;batch size\n"); + for (int r=0; r4.0) break; + b=2*a*b+im; + a=a2-b2+cr; + } + M[j]= (unsigned char) 255-((k*255/niter)); + )); + + unsigned long dimensions[3] = {dim, 0, 0}; + try { + + pattern->setParameterPlaceholder("i", GSPAR_PARAM_VALUE) + .setParameter("dim", dim) + .setParameter("init_a", init_a) + .setParameter("init_b", init_b) + .setParameter("step", step) + .setParameter("niter", niter) + .setParameterPlaceholder("M", GSPAR_PARAM_POINTER, GSPAR_PARAM_INOUT); + + pattern->setStdVarNames({"j"}); + + pattern->compile(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + // Start time + gettimeofday(&t1,NULL); + + for(unsigned long i=0; i4.0) break; + // b=2*a*b+im; + // a=a2-b2+cr; + // } + // M[j]= (unsigned char) 255-((k*255/niter)); + // } + + try { + pattern->setParameter("i", i) + .setParameter("M", dim, M, GSPAR_PARAM_INOUT); + + pattern->run(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +#ifdef DEBUG + ShowLine(M,dim,i); +#endif + } + // Stop time + gettimeofday(&t2,NULL); + + avg += runs[r] = diffmsec(t2,t1); + printf("%s;%lu;%lu;%.2f;1;1\n", argv[0], dim, niter, runs[r]); + } + avg = avg / (double) retries; + double var = 0; + for (int r=0; r +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + 
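  // Every example selects its GPU backend with this preprocessor switch:
  // building with GSPARDRIVER_CUDA defined picks the CUDA driver header and
  // namespace, otherwise (or with GSPARDRIVER_OPENCL) the OpenCL ones are used.
  // A hedged sketch of how the macro is typically supplied on the compiler
  // command line (the project's actual Makefile targets are not shown here,
  // and the usual GSParLib/CUDA/OpenCL include and link flags are elided):
  //
  //   g++ matrix_multi_map_cm.cpp -DGSPARDRIVER_CUDA   ...   // CUDA backend
  //   g++ matrix_multi_map_cm.cpp -DGSPARDRIVER_OPENCL ...   // OpenCL backend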
namespace Driver = GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + namespace Driver = GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +namespace Pattern = GSPar::Pattern; + +void multiply(const unsigned long size, const float *matrixA, const float *matrixB, float *result) { + try { + + auto map = new Pattern::Map(GSPAR_STRINGIZE_SOURCE( + float sum = 0; + for (unsigned long k = 0; k < size; k++) { + sum += a[k * size + i] * b[j * size + k]; // Column-major + } + result[j * size + i] = sum; + )); + + map->setStdVarNames({"i", "j"}); + + map->setParameter("size", size) + .setParameter("a", sizeof(float) * size * size, matrixA) + .setParameter("b", sizeof(float) * size * size, matrixB) + .setParameter("result", sizeof(float) * size * size, result, Pattern::GSPAR_PARAM_OUT); + + map->compile({size, size, 0}); + + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + map->run(); + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + + delete map; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void printMatrix(const unsigned long size, float *matrix, bool compact = false) { + if (compact || size > 100) { + std::cout << matrix[0] << ".." << matrix[size-1] << ".." << matrix[size*size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << std::endl; + for (unsigned long j = 0; j < size; j++) { + // std::cout << matrix[i * size + j] << " "; // Row-major + std::cout << matrix[j * size + i] << " "; // Column-major + } + } + } + std::cout << std::endl; +} + +int main(int argc, char const *argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long MX = std::stoi(argv[1]); + + float *matrixA = new float[MX * MX]; + float *matrixB = new float[MX * MX]; + float *result = new float[MX * MX]; + for (unsigned long i = 0; i < MX; i++) { + for (unsigned long j = 0; j < MX; j++) { + // Column-major + matrixA[j * MX + i] = i+1; + matrixB[j * MX + i] = j+1; + result[j * MX + i] = 0; + } + } + + #ifdef DEBUG + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + std::cout << "Matrix A: "; + printMatrix(MX, matrixA); + std::cout << "Matrix B: "; + printMatrix(MX, matrixB); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + multiply(MX, matrixA, matrixB, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Result: "; + printMatrix(MX, result); + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << result[0] << ".." << result[MX-1] << ".." 
<< result[MX*MX-1] << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + delete[] matrixA; + delete[] matrixB; + delete[] result; +} diff --git a/examples/pattern_api/matrix_multi_map_rm.cpp b/examples/pattern_api/matrix_multi_map_rm.cpp new file mode 100644 index 0000000..7b176ae --- /dev/null +++ b/examples/pattern_api/matrix_multi_map_rm.cpp @@ -0,0 +1,132 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + namespace Driver = GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + namespace Driver = GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +namespace Pattern = GSPar::Pattern; + +void multiply(const unsigned long size, const float *matrixA, const float *matrixB, float *result) { + try { + + auto map = new Pattern::Map(GSPAR_STRINGIZE_SOURCE( + float sum = 0; + for (unsigned long k = 0; k < size; k++) { + sum += a[i * size + k] * b[k * size + j]; // Row-major + } + result[i * size + j] = sum; + )); + + map->setStdVarNames({"i", "j"}); + + map->setParameter("size", size) + .setParameter("a", sizeof(float) * size * size, matrixA) + .setParameter("b", sizeof(float) * size * size, matrixB) + .setParameter("result", sizeof(float) * size * size, result, Pattern::GSPAR_PARAM_OUT); + + map->compile({size, size, 0}); + + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + map->run(); + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + + delete map; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void printMatrix(const unsigned long size, float *matrix, bool compact = false) { + if (compact || size > 100) { + std::cout << matrix[0] << ".." << matrix[size-1] << ".." 
<< matrix[size*size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << std::endl; + for (unsigned long j = 0; j < size; j++) { + std::cout << matrix[i * size + j] << " "; // Row-major + } + } + } + std::cout << std::endl; +} + +int main(int argc, char const *argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long MX = std::stoi(argv[1]); + + float *matrixA = new float[MX * MX]; + float *matrixB = new float[MX * MX]; + float *result = new float[MX * MX]; + for (unsigned long i = 0; i < MX; i++) { + for (unsigned long j = 0; j < MX; j++) { + // Row-major + matrixA[i * MX + j] = i+1; + matrixB[i * MX + j] = j+1; + result[i * MX + j] = 0; + } + } + + #ifdef DEBUG + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + std::cout << "Matrix A: "; + printMatrix(MX, matrixA); + std::cout << "Matrix B: "; + printMatrix(MX, matrixB); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + multiply(MX, matrixA, matrixB, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Result: "; + printMatrix(MX, result); + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << result[0] << ".." << result[MX-1] << ".." << result[MX*MX-1] << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + delete[] matrixA; + delete[] matrixB; + delete[] result; +} diff --git a/examples/pattern_api/raytracer.cpp b/examples/pattern_api/raytracer.cpp new file mode 100644 index 0000000..865a426 --- /dev/null +++ b/examples/pattern_api/raytracer.cpp @@ -0,0 +1,818 @@ +// [header] +// A very basic raytracer example. +// [/header] +// [compile] +// c++ -o raytracer -O3 -Wall raytracer.cpp +// [/compile] +// [ignore] +// Copyright (C) 2012 www.scratchapixel.com +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
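// Note on this GPU port: the device-side Vec3f, Sphere and trace() routines are
// supplied as a string of extra kernel code (a templated C++ variant for the
// CUDA backend, a struct-plus-helper-functions variant for OpenCL) and attached
// to the Map pattern inside raytrace() further below with
//   pattern->addExtraKernelCode(extraKernelCode);
// so the per-pixel map kernel can call trace() on the device, mirroring the
// commented-out CPU ray loop kept in raytrace().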
+// [/ignore] +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "rapidxml-1.13/rapidxml.hpp" + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + + const char* extraKernelCode = GSPAR_STRINGIZE_SOURCE( + template + class Vec3 + { + public: + T x, y, z; + Vec3() : x(T(0)), y(T(0)), z(T(0)) {} + Vec3(T xx) : x(xx), y(xx), z(xx) {} + Vec3(T xx, T yy, T zz) : x(xx), y(yy), z(zz) {} + void normalize() { Vec3f_normalize(this); } + Vec3 operator * (const T &f) const { return Vec3(x * f, y * f, z * f); } + Vec3 operator * (const Vec3 &v) const { return Vec3(x * v.x, y * v.y, z * v.z); } + T dot(const Vec3 &v) const { return x * v.x + y * v.y + z * v.z; } + Vec3 operator - (const Vec3 &v) const { return Vec3(x - v.x, y - v.y, z - v.z); } + Vec3 operator + (const Vec3 &v) const { return Vec3(x + v.x, y + v.y, z + v.z); } + Vec3& operator += (const Vec3 &v) { x += v.x, y += v.y, z += v.z; return *this; } + Vec3& operator *= (const Vec3 &v) { x *= v.x, y *= v.y, z *= v.z; return *this; } + Vec3 operator - () const { return Vec3(-x, -y, -z); } + T length2() const { return x * x + y * y + z * z; } + T length() const { return sqrt(length2()); } + }; + + typedef Vec3 Vec3f; + typedef Vec3 Vec3b; + + Vec3f Vec3f_new_single(float xx) { + Vec3f v; + v.x = xx; + v.y = xx; + v.z = xx; + return v; + } + Vec3f Vec3f_new(float xx, float yy, float zz) { + Vec3f v; + v.x = xx; + v.y = yy; + v.z = zz; + return v; + } + Vec3f Vec3f_mult_single(const Vec3f *thes, const float f) { return Vec3f_new(thes->x * f, thes->y * f, thes->z * f); } + Vec3f Vec3f_mult(const Vec3f *thes, const Vec3f* v) { return Vec3f_new(thes->x * v->x, thes->y * v->y, thes->z * v->z); } + float Vec3f_dot(const Vec3f *thes, const Vec3f *v) { return thes->x * v->x + thes->y * v->y + thes->z * v->z; } + Vec3f Vec3f_minus(const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x - v->x, thes->y - v->y, thes->z - v->z); } + Vec3f Vec3f_plus(const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x + v->x, thes->y + v->y, thes->z + v->z); } + Vec3f Vec3f_inverse(const Vec3f *thes) { return Vec3f_new(-thes->x, -thes->y, -thes->z); } + float Vec3f_length2(const Vec3f *thes) { return thes->x * thes->x + thes->y * thes->y + thes->z * thes->z; } + void Vec3f_normalize(Vec3f *thes) { + float nor2 = Vec3f_length2(thes); + if (nor2 > 0) { + float invNor = 1 / sqrt(nor2); + thes->x *= invNor; + thes->y *= invNor; + thes->z *= invNor; + } + } + + class Sphere + { + public: + const char* id; + Vec3f center; /// position of the sphere + float radius, radius2; /// sphere radius and radius^2 + Vec3f surfaceColor, emissionColor; /// surface color and emission (light) + float transparency, reflection; /// surface transparency and reflectivity + int animation_frame; + Vec3b animation_position_rand; + Vec3f animation_position; + Sphere() { } + Sphere( + const char* id, + const Vec3f &c, + const float &r, + const Vec3f &sc, + const float &refl = 0, + const float &transp = 0, + const Vec3f &ec = 0) : + id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc), + emissionColor(ec), transparency(transp), reflection(refl) + { + animation_frame = 0; + } + //[comment] + // Compute a ray-sphere intersection using the geometric solution + //[/comment] + bool intersect(const Vec3f &rayorig, const Vec3f &raydir, float &t0, float &t1) const + { + Vec3f l = center - rayorig; + float tca = l.dot(raydir); + if (tca < 0) return false; + float d2 = 
l.dot(l) - tca * tca; + if (d2 > radius2) return false; + float thc = sqrt(radius2 - d2); + t0 = tca - thc; + t1 = tca + thc; + + return true; + } + }; + + float mixfresnel(const float &a, const float &b, const float &mixval) { + return b * mixval + a * (1 - mixval); + } + + Vec3f trace( + const Vec3f *rayorig_ptr, + const Vec3f *raydir_ptr, + const Sphere *spheres, + const unsigned int spheres_size, + const int &depth) + { + const Vec3f rayorig = *rayorig_ptr; + const Vec3f raydir = *raydir_ptr; + + float tnear = 1e8; + const Sphere* sphere = NULL; + // find intersection of this ray with the sphere in the scene + for (unsigned i = 0; i < spheres_size; ++i) { + float t0 = 1e8, t1 = 1e8; + if (spheres[i].intersect(rayorig, raydir, t0, t1)) { + if (t0 < 0) t0 = t1; + if (t0 < tnear) { + tnear = t0; + sphere = &spheres[i]; + } + } + } + // if there's no intersection return black or background color + if (!sphere) return Vec3f(2); + Vec3f surfaceColor = 0; // color of the ray/surfaceof the object intersected by the ray + Vec3f phit = rayorig + raydir * tnear; // point of intersection + Vec3f nhit = phit - sphere->center; // normal at the intersection point + nhit.normalize(); // normalize normal direction + // If the normal and the view direction are not opposite to each other + // reverse the normal direction. That also means we are inside the sphere so set + // the inside bool to true. Finally reverse the sign of IdotN which we want + // positive. + float bias = 1e-4; // add some bias to the point from which we will be tracing + bool inside = false; + if (raydir.dot(nhit) > 0) nhit = -nhit, inside = true; + if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < 5) { //MAX_RAY_DEPTH + float facingratio = 1+raydir.dot(nhit); + float fresneleffect = facingratio*facingratio*facingratio; + // change the mix value to tweak the effect + fresneleffect = mixfresnel(fresneleffect, 1, 0.1); + // compute reflection direction (not need to normalize because all vectors + // are already normalized) + Vec3f refldir = raydir - nhit * 2 * raydir.dot(nhit); + refldir.normalize(); + Vec3f new_rayorig = phit + nhit * bias; + Vec3f reflection = trace(&new_rayorig, &refldir, spheres, spheres_size, depth + 1); + Vec3f refraction = 0; + // if the sphere is also transparent compute refraction ray (transmission) + if (sphere->transparency) { + float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface? 
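            // The refraction direction computed below follows the standard
            // vector form of Snell's law:
            //   eta  = n1 / n2        (1/ior when entering the sphere, ior when leaving)
            //   cosi = -N . I         (I = raydir, N = nhit)
            //   k    = 1 - eta^2 * (1 - cosi^2)   // cos^2 of the refracted angle
            //   T    = eta * I + (eta * cosi - sqrt(k)) * N
            // A negative k would mean total internal reflection; this example
            // does not special-case it.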
+ float cosi = -nhit.dot(raydir); + float k = 1 - eta * eta * (1 - cosi * cosi); + Vec3f refrdir = raydir * eta + nhit * (eta * cosi - sqrt(k)); + refrdir.normalize(); + new_rayorig = phit - nhit * bias; + refraction = trace(&new_rayorig, &refrdir, spheres, spheres_size, depth + 1); + } + // the result is a mix of reflection and refraction (if the sphere is transparent) + surfaceColor = ( + reflection * fresneleffect + + refraction * (1 - fresneleffect) * sphere->transparency) * sphere->surfaceColor; + } + else { + // it's a diffuse object, no need to raytrace any further + for (unsigned i = 0; i < spheres_size; ++i) { + if (spheres[i].emissionColor.x > 0) { + // this is a light + Vec3f transmission = 1; + Vec3f lightDirection = spheres[i].center - phit; + lightDirection.normalize(); + for (unsigned j = 0; j < spheres_size; ++j) { + if (i != j) { + float t0, t1; + if (spheres[j].intersect(phit + nhit * bias, lightDirection, t0, t1)) { + transmission = 0; + break; + } + } + } + surfaceColor += sphere->surfaceColor * transmission * + max(float(0), nhit.dot(lightDirection)) * spheres[i].emissionColor; + } + } + } + + return surfaceColor + sphere->emissionColor; + } + ); + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + + const char* extraKernelCode = GSPAR_STRINGIZE_SOURCE( + typedef struct tVec3b { bool x; bool y; bool z; } Vec3b; + typedef struct tVec3f { float x; float y; float z; } Vec3f; + Vec3f Vec3f_new_single(float xx) { + Vec3f v; + v.x = xx; + v.y = xx; + v.z = xx; + return v; + } + Vec3f Vec3f_new(float xx, float yy, float zz) { + Vec3f v; + v.x = xx; + v.y = yy; + v.z = zz; + return v; + } + Vec3f Vec3f_mult_single(const Vec3f *thes, const float f) { return Vec3f_new(thes->x * f, thes->y * f, thes->z * f); } + Vec3f Vec3f_mult(const Vec3f *thes, const Vec3f* v) { return Vec3f_new(thes->x * v->x, thes->y * v->y, thes->z * v->z); } + Vec3f Vec3f_mult__global_first(const __global Vec3f *thes, const Vec3f* v) { return Vec3f_new(thes->x * v->x, thes->y * v->y, thes->z * v->z); } + Vec3f Vec3f_mult__global_second(const Vec3f *thes, const __global Vec3f* v) { return Vec3f_new(thes->x * v->x, thes->y * v->y, thes->z * v->z); } + float Vec3f_dot(const Vec3f *thes, const Vec3f *v) { return thes->x * v->x + thes->y * v->y + thes->z * v->z; } + Vec3f Vec3f_minus(const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x - v->x, thes->y - v->y, thes->z - v->z); } + Vec3f Vec3f_minus__global_first(__global const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x - v->x, thes->y - v->y, thes->z - v->z); } + Vec3f Vec3f_minus__global_second(const Vec3f *thes, const __global Vec3f *v) { return Vec3f_new(thes->x - v->x, thes->y - v->y, thes->z - v->z); } + Vec3f Vec3f_plus(const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x + v->x, thes->y + v->y, thes->z + v->z); } + Vec3f Vec3f_plus__global_second(const Vec3f *thes, const __global Vec3f *v) { return Vec3f_new(thes->x + v->x, thes->y + v->y, thes->z + v->z); } + Vec3f Vec3f_inverse(const Vec3f *thes) { return Vec3f_new(-thes->x, -thes->y, -thes->z); } + float Vec3f_length2(const Vec3f *thes) { return thes->x * thes->x + thes->y * thes->y + thes->z * thes->z; } + void Vec3f_normalize(Vec3f *thes) { + float nor2 = Vec3f_length2(thes); + if (nor2 > 0) { + float invNor = 1 / sqrt(nor2); + thes->x *= invNor; + thes->y *= invNor; + thes->z *= invNor; + } + } + + typedef struct tSphere { + const char *id; + Vec3f center; + float radius, 
radius2; + Vec3f surfaceColor, emissionColor; + float transparency, reflection; + int animation_frame; + Vec3b animation_position_rand; + Vec3f animation_position; + } Sphere; + + bool Sphere_intersect(__global const Sphere* thes, const Vec3f *rayorig, const Vec3f *raydir, float *t0, float *t1) { + Vec3f l = Vec3f_minus__global_first(&thes->center, rayorig); + float tca = Vec3f_dot(&l, raydir); + if (tca < 0) return false; + float d2 = Vec3f_dot(&l, &l) - tca * tca; + if (d2 > thes->radius2) return false; + float thc = sqrt(thes->radius2 - d2); + *t0 = tca - thc; + *t1 = tca + thc; + + return true; + } + + float mix_fresnel(const float a, const float b, const float mixval) { + return b * mixval + a * (1 - mixval); + } + + Vec3f trace( + const Vec3f* rayorig, + const Vec3f* raydir, + const __global Sphere *spheres, + const unsigned int spheres_size, + const int depth) + { + float tnear = 1e8; + const __global Sphere* sphere = NULL; + // find intersection of the ray with the sphere in the scene + for (unsigned i = 0; i < spheres_size; ++i) { + float t0 = 1e8, t1 = 1e8; + if (Sphere_intersect(&spheres[i], rayorig, raydir, &t0, &t1)) { + if (t0 < 0) t0 = t1; + if (t0 < tnear) { + tnear = t0; + sphere = &spheres[i]; + } + } + } + + // if there's no intersection return black or background color + if (!sphere) return Vec3f_new_single(2); + Vec3f surfaceColor = Vec3f_new_single(0); // color of the ray/surfaceof the object intersected by the ray + Vec3f aux = Vec3f_mult_single(raydir, tnear); + Vec3f phit = Vec3f_plus(rayorig, &aux); + Vec3f nhit = Vec3f_minus__global_second(&phit, &sphere->center); // normal at the intersection point + Vec3f_normalize(&nhit); // normalize normal direction + // If the normal and the view direction are not opposite to each other + // reverse the normal direction. That also means we are inside the sphere so set + // the inside bool to true. Finally reverse the sign of IdotN which we want + // positive. + float bias = 1e-4; // add some bias to the point from which we will be tracing + bool inside = false; + if (Vec3f_dot(raydir, &nhit) > 0) { + nhit = Vec3f_inverse(&nhit); + inside = true; + } + if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < 5) { // MAX_RAY_DEPTH + float facingratio = 1+Vec3f_dot(raydir, &nhit); + float fresneleffect = facingratio*facingratio*facingratio; + // change the mix value to tweak the effect + fresneleffect = mix_fresnel(fresneleffect, 1, 0.1); + // compute reflection direction (not need to normalize because all vectors + // are already normalized) + aux = Vec3f_mult_single(&nhit, 2); + aux = Vec3f_mult_single(&aux, Vec3f_dot(raydir, &nhit)); + Vec3f refldir = Vec3f_minus(raydir, &aux); + Vec3f_normalize(&refldir); + aux = Vec3f_mult_single(&nhit, bias); + aux = Vec3f_plus(&phit, &aux); + Vec3f reflection = trace(&aux, &refldir, spheres, spheres_size, depth + 1); + Vec3f refraction = Vec3f_new_single(0); + // if the sphere is also transparent compute refraction ray (transmission) + if (sphere->transparency) { + float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface? 
+ float cosi = -Vec3f_dot(&nhit, raydir); + float k = 1 - eta * eta * (1 - cosi * cosi); + aux = Vec3f_mult_single(raydir, eta); + Vec3f aux2 = Vec3f_mult_single(&nhit, (eta * cosi - sqrt(k))); + Vec3f refrdir = Vec3f_plus(&aux, &aux2); + Vec3f_normalize(&refrdir); + aux = Vec3f_mult_single(&nhit, bias); + aux = Vec3f_minus(&phit, &aux); + refraction = trace(&aux, &refrdir, spheres, spheres_size, depth + 1); + } + // the result is a mix of reflection and refraction (if the sphere is transparent) + aux = Vec3f_mult_single(&reflection, fresneleffect); + Vec3f aux2 = Vec3f_mult_single(&refraction, (1 - fresneleffect) * sphere->transparency); + surfaceColor = Vec3f_plus(&aux, &aux2); + surfaceColor = Vec3f_mult__global_second(&surfaceColor, &sphere->surfaceColor); + } + else { + // it's a diffuse object, no need to raytrace any further + for (unsigned i = 0; i < spheres_size; ++i) { + if (spheres[i].emissionColor.x > 0) { + // this is a light + Vec3f transmission = Vec3f_new_single(1); + Vec3f lightDirection = Vec3f_minus__global_first(&spheres[i].center, &phit); + Vec3f_normalize(&lightDirection); + for (unsigned j = 0; j < spheres_size; ++j) { + if (i != j) { + float t0, t1; //Unused + // t0 = 0; + // t1 = 0; + aux = Vec3f_mult_single(&nhit, bias); + aux = Vec3f_plus(&phit, &aux); + if (Sphere_intersect(&spheres[j], &aux, &lightDirection, &t0, &t1)) { + transmission = Vec3f_new_single(0); + break; + } + } + } + + aux = Vec3f_mult__global_first(&sphere->surfaceColor, &transmission); + aux = Vec3f_mult_single(&aux, fmax((float)0, Vec3f_dot(&nhit, &lightDirection))); + aux = Vec3f_mult__global_second(&aux, &spheres[i].emissionColor); + surfaceColor = Vec3f_plus(&surfaceColor, &aux); + } + } + } + + return Vec3f_plus__global_second(&surfaceColor, &sphere->emissionColor); + } + ); + +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +#if defined __linux__ || defined __APPLE__ +// "Compiled for Linux +#else +// Windows doesn't define these values by default, Linux does +#define M_PI 3.141592653589793 +#endif + +// This variable controls if it should work in memory. 
If it is not defined, works in disk +#define WORK_IN_MEMORY + +#ifdef WORK_IN_MEMORY +#define WORKING_MEDIA "memory" +#else +#define WORKING_MEDIA "disk" +#endif + +class Vec3f { +public: + float x, y, z; + Vec3f() : x(0), y(0), z(0) {} + Vec3f(float xx) : x(xx), y(xx), z(xx) {} + Vec3f(float xx, float yy, float zz) : x(xx), y(yy), z(zz) {} +}; +struct Vec3b { + bool x; bool y; bool z; +}; + +class Sphere +{ +public: + const char *id; + Vec3f center; /// position of the sphere + float radius, radius2; /// sphere radius and radius^2 + Vec3f surfaceColor, emissionColor; /// surface color and emission (light) + float transparency, reflection; /// surface transparency and reflectivity + int animation_frame; + Vec3b animation_position_rand; + Vec3f animation_position; + Sphere() { } + Sphere( + const char *id, + const Vec3f &c, + const float &r, + const Vec3f &sc, + const float &refl = 0, + const float &transp = 0, + const Vec3f &ec = 0) : + id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc), + emissionColor(ec), transparency(transp), reflection(refl) + { + animation_frame = 0; + } +}; + +void save_image(const std::string output_folder, const int frame, const unsigned int width, const unsigned int height, Vec3f *image) { + // Save result to a PPM image (keep these flags if you compile under Windows) + std::stringstream ss; + ss << std::setfill('0') << std::setw(5) << frame; + std::string filename = output_folder + "/frame" + ss.str() + ".ppm"; +#ifdef DEBUG + std::cout << "[Work] Writing frame " << frame << " to " << filename << std::endl; +#endif + std::ofstream ofs(filename, std::ios::out | std::ios::binary); + ofs << "P6\n" << width << " " << height << "\n255\n"; + for (unsigned i = 0; i < width * height; ++i) { + ofs << (unsigned char)(std::min(float(1), image[i].x) * 255) << + (unsigned char)(std::min(float(1), image[i].y) * 255) << + (unsigned char)(std::min(float(1), image[i].z) * 255); + } + ofs.close(); +} + + +void raytrace(std::string output_folder, int total_frames, unsigned int width, unsigned int height, const std::vector &initial_spheres) { + float invWidth = 1 / float(width); + float invHeight = 1 / float(height); + float fov = 30; + float aspectratio = width / float(height); + float angle = tan(M_PI * 0.5 * fov / 180.); + + // std::cout << "[Vec3f] CPU version is " << sizeof(Vec3f) << ", gpu version is " << sizeof(GpuVec3f) << std::endl; + // std::cout << "[Sphere] CPU version is " << sizeof(Sphere) << ", gpu version is " << sizeof(GpuSphere) << std::endl; + +#ifdef WORK_IN_MEMORY + unsigned int total_memory = sizeof(Vec3f)*total_frames*width*height; + std::string total_memory_unit = " bytes"; + if (total_memory > 1024) { + total_memory = (total_frames*width*height)/1024; + total_memory_unit = " KB"; + } + if (total_memory > (10*1024)) { + total_memory /= 1024; + total_memory_unit = " MB"; + } +#ifdef DEBUG + std::cout << "[Init] Allocating " << total_memory << total_memory_unit << " of memory to store images" << std::endl; +#endif + Vec3f **images = new Vec3f*[total_frames]; + for (int f=0; fsetParameter("width", width) + .setParameter("invWidth", invWidth) + .setParameter("invHeight", invHeight) + .setParameter("aspectratio", aspectratio) + .setParameter("angle", angle) + .setParameterPlaceholder("image", GSPAR_PARAM_POINTER, GSPAR_PARAM_INOUT) + .setParameterPlaceholder("spheres") + .setParameterPlaceholder("spheres_size", GSPAR_PARAM_VALUE); + + // Extra kernel code + pattern->addExtraKernelCode(extraKernelCode); + + unsigned long dimensions[3] = {width, height, 
0}; + pattern->compile(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +# ifndef NO_TIME_MEASUREMENT +#ifdef DEBUG + std::cout << "[Time] Starting time measurement" << std::endl; +#endif + time_t wall_start, wall_end; + time(&wall_start); + clock_t cpu_start = clock(); +# endif + + for (int frame = 1; frame <= total_frames; frame++) { +#ifdef DEBUG + std::cout << "[Work] Generating frame " << frame << "..." << std::endl; +#endif + // Set up the scenne + unsigned int spheres_size = initial_spheres.size(); + Sphere* spheres = new Sphere[spheres_size]; + memcpy(spheres, initial_spheres.data(), sizeof(Sphere) * spheres_size); + + // Animation of each frame + for(unsigned long i = 0; i != spheres_size; i++) { + if (spheres[i].animation_frame != 0 && + (spheres[i].animation_frame > 0 && frame < spheres[i].animation_frame)) { + continue; + } + + int adjusted_frame = frame; + if (spheres[i].animation_frame < 0) { + if (frame > spheres[i].animation_frame*-1) { + adjusted_frame = spheres[i].animation_frame*-1; + } + } else if (spheres[i].animation_frame > 0) { + adjusted_frame -= spheres[i].animation_frame; + } + + if (spheres[i].animation_position.x) { + if (spheres[i].animation_position_rand.x) { + spheres[i].center.x += (drand48()*spheres[i].animation_position.x); + } else { + spheres[i].center.x += adjusted_frame*spheres[i].animation_position.x; + } + } + if (spheres[i].animation_position.y) { + if (spheres[i].animation_position_rand.y) { + spheres[i].center.y += (drand48()*spheres[i].animation_position.y); + } else { + spheres[i].center.y += adjusted_frame*spheres[i].animation_position.y; + } + } + if (spheres[i].animation_position.z) { + if (spheres[i].animation_position_rand.z) { + spheres[i].center.z += (drand48()*spheres[i].animation_position.z); + } else { + spheres[i].center.z += adjusted_frame*spheres[i].animation_position.z; + } + } + } + +#ifdef WORK_IN_MEMORY + Vec3f *image = images[frame-1]; +#else + Vec3f *image = new Vec3f[width * height]; +#endif + + try { + + // // Trace rays + // for (unsigned y = 0; y < height; ++y) { + // for (unsigned x = 0; x < width; ++x) { + // float xx = (2 * ((x + 0.5) * invWidth) - 1) * angle * aspectratio; + // float yy = (1 - 2 * ((y + 0.5) * invHeight)) * angle; + // Vec3f raydir(xx, yy, -1); + // raydir.normalize(); + // image[y*width+x] = trace(Vec3f(0), raydir, spheres, 0); + // } + // } + + // Kernel parameters + pattern->setParameter("image", sizeof(Vec3f) * width * height, image, GSPAR_PARAM_INOUT) + .setParameter("spheres", sizeof(Sphere) * spheres_size, spheres) + .setParameter("spheres_size", spheres_size); + + unsigned long dimensions[3] = {width, height, 0}; + pattern->run(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + delete [] spheres; +#ifndef WORK_IN_MEMORY + save_image(output_folder, frame, width, height, image); + delete [] image; +#endif + } + +# ifndef NO_TIME_MEASUREMENT +#ifdef DEBUG + std::cout << "[Time] Stopping time measurement" << std::endl; +#endif + clock_t cpu_end = clock(); + time(&wall_end); + double cpu_time_seconds = ((double) (cpu_end - cpu_start)) / CLOCKS_PER_SEC; + double wall_time_seconds = difftime(wall_end, wall_start); + printf("The generation of %d frames in %s of %u x %u with %lu spheres took:\n", total_frames, WORKING_MEDIA, width, height, initial_spheres.size()); + printf("%.0f wall-clock 
seconds (%.2f FPS)\n", wall_time_seconds, ((double)total_frames)/wall_time_seconds); + printf("%.2f CPU time seconds\n", cpu_time_seconds); +# endif + +#ifdef WORK_IN_MEMORY + for (int frame = 1; frame <= total_frames; frame++) { + save_image(output_folder, frame, width, height, images[frame-1]); + delete [] images[frame-1]; + } + delete [] images; +#endif +} + + +int main(int argc, char **argv) +{ + int image_size_parameter = 2; + int total_frames = 1; + + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " [ [ ]]" << std::endl; + std::cerr << " : XML with the scene description" << std::endl; + std::cerr << " : Folder on which produce output images" << std::endl; + std::cerr << " : Size of images to generate, a single integer meaning 1=320x180, 2=640x360, 4=HD, 6=FHD and so on. Defaults to " << image_size_parameter << std::endl; + std::cerr << " : Number of frames to produce. Defaults to " << total_frames << std::endl; + exit(1); + } + srand48(13); + + std::string scene_filename(argv[1]); + std::string output_folder = argv[2]; + if (argc > 3) { + image_size_parameter = atoi(argv[3]); + } + if (argc > 4) { + total_frames = atoi(argv[4]); + } + + // 1 = 320x180 + // 2 = 640x360 + // 4 = 1280x720 (HD) + // 6 = 1920x1080 (FHD) + unsigned int image_size_multiplier = 20*image_size_parameter; + + unsigned int width = image_size_multiplier*16; + unsigned int height = image_size_multiplier*9; + + std::vector initial_spheres; + +#ifdef DEBUG + std::cout << "[Init] Generating " << total_frames << " frames of " << width << "x" << height << " in " << WORKING_MEDIA << " in " << output_folder << std::endl; + std::cout << "[Init] Loading scene from " << scene_filename << std::endl; +#endif + + // Parses the scene + std::ifstream scene_file(scene_filename, std::ios::binary | std::ios::ate); + std::streamsize scene_file_size = scene_file.tellg(); + scene_file.seekg(0, std::ios::beg); + char *scene_buffer = new char[scene_file_size]; + if (scene_file.read(scene_buffer, scene_file_size)) { + rapidxml::xml_document<> doc; + doc.parse<0>(scene_buffer); + rapidxml::xml_node<> *scene_node = doc.first_node("scene"); + + rapidxml::xml_node<> *spheres_node = scene_node->first_node("spheres"); + rapidxml::xml_node<> *sphere_node = spheres_node->first_node(); + while (sphere_node != 0) { + // position, radius, surface color, reflectivity, transparency, emission color + initial_spheres.push_back(Sphere( + sphere_node->first_attribute("id")->value(), + Vec3f( //Center position + atof(sphere_node->first_node("position")->first_attribute("x")->value()), + atof(sphere_node->first_node("position")->first_attribute("y")->value()), + atof(sphere_node->first_node("position")->first_attribute("z")->value()) + ), + atof(sphere_node->first_node("size")->first_attribute("radius")->value()), // Radius + Vec3f( //Surface color + atof(sphere_node->first_node("surface_color")->first_attribute("red")->value()), + atof(sphere_node->first_node("surface_color")->first_attribute("green")->value()), + atof(sphere_node->first_node("surface_color")->first_attribute("blue")->value()) + ), + atof(sphere_node->first_node("reflectivity")->first_attribute("value")->value()), // Reflectivity + atof(sphere_node->first_node("transparency")->first_attribute("value")->value()) // Transparency + )); + if (sphere_node->first_node("emission_color")) { + initial_spheres.back().emissionColor = Vec3f( + atof(sphere_node->first_node("emission_color")->first_attribute("red")->value()), + 
atof(sphere_node->first_node("emission_color")->first_attribute("green")->value()), + atof(sphere_node->first_node("emission_color")->first_attribute("blue")->value()) + ); + } + sphere_node = sphere_node->next_sibling(); + } +#ifdef DEBUG + std::cout << "[Init] Loaded " << initial_spheres.size() << " spheres, looking for animations" << std::endl; +#endif + + rapidxml::xml_node<> *animation_node = scene_node->first_node("animation"); + for (rapidxml::xml_node<> *sphere_animation = animation_node->first_node(); + sphere_animation; sphere_animation = sphere_animation->next_sibling()) { + std::string id = sphere_animation->first_attribute("id")->value(); + for(unsigned long i = 0; i != initial_spheres.size(); i++) { + if (id == initial_spheres[i].id) { + rapidxml::xml_node<> *position_node = sphere_animation->first_node("position"); + if (position_node) { + rapidxml::xml_attribute<> *attr; + attr = position_node->first_attribute("after"); + if (attr) { + initial_spheres[i].animation_frame = atoi(attr->value()); + } + attr = position_node->first_attribute("before"); + if (attr) { + initial_spheres[i].animation_frame = atoi(attr->value())*-1; + } + attr = position_node->first_attribute("x"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.x = true; + initial_spheres[i].animation_position.x = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.x = atof(attr->value()); + } + } + attr = position_node->first_attribute("y"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.y = true; + initial_spheres[i].animation_position.y = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.y = atof(position_node->first_attribute("y")->value()); + } + } + attr = position_node->first_attribute("z"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.z = true; + initial_spheres[i].animation_position.z = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.z = atof(position_node->first_attribute("z")->value()); + } + } + } + } + } + } +#ifdef DEBUG + std::cout << "[Init] Finished loading animation for spheres" << std::endl; +#endif + + } + + raytrace(output_folder, total_frames, width, height, initial_spheres); + + return 0; +} \ No newline at end of file diff --git a/examples/pattern_api/reduce_sample.cpp b/examples/pattern_api/reduce_sample.cpp new file mode 100644 index 0000000..9e8a3e5 --- /dev/null +++ b/examples/pattern_api/reduce_sample.cpp @@ -0,0 +1,55 @@ +#ifdef GSPARDRIVER_CUDA + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#else + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#endif +#include "GSPar_PatternReduce.hpp" +using namespace GSPar::Pattern; + +int reduce_sum(const int size, const int *vector) { + int total; + try { + auto pattern = new Reduce("in_vector", "+", "total"); + pattern->setParameter("in_vector", sizeof(int) * size, vector) + .setParameter("total", sizeof(int), &total, GSPAR_PARAM_OUT); + pattern->run({(unsigned int)size, 0}); + delete pattern; + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + return total; +} + +void print_vector(int size, const int* vector, bool compact = false) { + if (compact || size > 100) { + 
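        // On reduce_sum() above: judging from this example (an interpretation,
        // not library documentation), the three Reduce constructor arguments
        // are the input buffer's parameter name, the binary operator applied
        // pairwise, and the scalar output's parameter name. A condensed,
        // illustrative restatement for an int array v of length n:
        //
        //   int total;
        //   auto sum = new Reduce("v",      // name of the input buffer parameter
        //                         "+",      // binary operator applied pairwise
        //                         "total"); // name of the scalar output parameter
        //   sum->setParameter("v", sizeof(int) * n, v)
        //      .setParameter("total", sizeof(int), &total, GSPAR_PARAM_OUT);
        //   sum->run({(unsigned int)n, 0});
        //   delete sum;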
std::cout << vector[0] << "..." << vector[size-1]; + } else { + for (int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const int VECTOR_SIZE = std::stoul(argv[1]); + int *vector = new int[VECTOR_SIZE]; + for (int i = 0; i < VECTOR_SIZE; i++) { + vector[i] = i; + } + + std::cout << "Summing vector: "; + print_vector(VECTOR_SIZE, vector); + + int total = reduce_sum(VECTOR_SIZE, vector); + + std::cout << "Summed vector of " << VECTOR_SIZE << " elements: " << total << std::endl; +} \ No newline at end of file diff --git a/examples/pattern_api/vector_sum_map.cpp b/examples/pattern_api/vector_sum_map.cpp new file mode 100644 index 0000000..d096da6 --- /dev/null +++ b/examples/pattern_api/vector_sum_map.cpp @@ -0,0 +1,107 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +void vector_sum(const unsigned int max, const unsigned int* a, const unsigned int* b, unsigned int* result) { + try { + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + result[x] = a[x] + b[x]; + )); + + pattern->setParameter("a", sizeof(unsigned int) * max, a) + .setParameter("b", sizeof(unsigned int) * max, b) + .setParameter("result", sizeof(unsigned int) * max, result, GSPAR_PARAM_OUT); + + // This set only max values + unsigned long dims[3] = {max, 0, 0}; // Pass ulong max values directly + // GSPar::Driver::Dimensions dims(max, 0, 0); // Makes struct passing max values + // GSPar::Driver::Dimensions dims = {max, 0, 0}; // Makes struct using auto-initialization with ulong max values + + // This way we can set max and min values + // GSPar::Driver::Dimensions dims = { // Makes struct using auto-initialization with ulong max and min values + // {max, 0}, // X: max, min + // {0, 0}, // Y: max, min + // {0, 0} // Z: max, min + // }; + + // Makes empty struct and them fill values for intended dimensions + // GSPar::Driver::Dimensions dims; + // dims.x = GSPar::Driver::SingleDimension(max, 5); + + pattern->run(dims); + + // We could also call initialize the Dimensions directly when calling the method: + // pattern->run({max, 0}); + + delete pattern; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + + // Create memory objects + unsigned int* result = new unsigned int[VECTOR_SIZE]; + unsigned int* a = new unsigned int[VECTOR_SIZE]; + unsigned int* b = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + a[i] = i; + b[i] = i + 1; + result[i] = 0; + } + + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(VECTOR_SIZE, a, b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); + + delete result; + delete a; + delete b; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/pattern_api/vector_sum_map_batch.cpp b/examples/pattern_api/vector_sum_map_batch.cpp new file mode 100644 index 0000000..63e4b12 --- /dev/null +++ b/examples/pattern_api/vector_sum_map_batch.cpp @@ -0,0 +1,117 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +void vector_sum(const unsigned int num_vectors, const unsigned int batch_size, const unsigned int vector_size, unsigned int **as, unsigned int **bs, unsigned int **results) { + try { + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + result[x] = a[x] + b[x]; + )); + + pattern->setParameter("size", vector_size) + .setParameterPlaceholder("a", GSPAR_PARAM_POINTER, GSPAR_PARAM_IN, true) + .setParameterPlaceholder("b", GSPAR_PARAM_POINTER, GSPAR_PARAM_IN, true) + .setParameterPlaceholder("result", GSPAR_PARAM_POINTER, GSPAR_PARAM_OUT, true); + + pattern->setBatchSize(batch_size); + + pattern->compile({vector_size, 0}); + + // If num_vectors is not divisible by batch_size, the lib issues a segfault. + // unsigned int batches = ceil((double)num_vectors/batch_size); + unsigned int batches = num_vectors/batch_size; + for (unsigned int b = 0; b < batches; b++) { + pattern->setBatchedParameter("a", sizeof(unsigned int) * vector_size, &as[b*batch_size]) + .setBatchedParameter("b", sizeof(unsigned int) * vector_size, &bs[b*batch_size]) + .setBatchedParameter("result", sizeof(unsigned int) * vector_size, &results[b*batch_size], GSPAR_PARAM_OUT); + + pattern->run(); + } + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 4) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + const unsigned int NUM_VECTORS = std::stoi(argv[2]); + const unsigned int BATCH_SIZE = std::stoi(argv[3]); + + // Create memory objects + unsigned int** results = new unsigned int*[NUM_VECTORS]; + unsigned int** as = new unsigned int*[NUM_VECTORS]; + unsigned int** bs = new unsigned int*[NUM_VECTORS]; + for (unsigned int v = 0; v < NUM_VECTORS; v++) { + results[v] = new unsigned int[VECTOR_SIZE]; + as[v] = new unsigned int[VECTOR_SIZE]; + bs[v] = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + as[v][i] = i + v; + bs[v][i] = i + v + 1; + results[v][i] = 0; + } + } + + std::cout << "Summing " << NUM_VECTORS << " vectors:" << std::endl; + for (unsigned int v = 0; v < NUM_VECTORS; v++) { + std::cout << "Vector A" << v+1 << ": "; + print_vector(VECTOR_SIZE, as[v]); + std::cout << "Vector B" << v+1 << ": "; + print_vector(VECTOR_SIZE, bs[v]); + } + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(NUM_VECTORS, BATCH_SIZE, VECTOR_SIZE, as, bs, results); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Results:" << std::endl; + for (unsigned int v = 0; v < NUM_VECTORS; v++) { + std::cout << "Vector " << v+1 << ": "; + print_vector(VECTOR_SIZE, results[v]); + } + + for (unsigned int v = 0; v < NUM_VECTORS; v++) { + delete results[v]; + delete as[v]; + delete bs[v]; + } + delete results; + delete as; + delete bs; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/pattern_api/vector_sum_map_managing_memory.cpp b/examples/pattern_api/vector_sum_map_managing_memory.cpp new file mode 100644 index 0000000..1c2e690 --- /dev/null +++ b/examples/pattern_api/vector_sum_map_managing_memory.cpp @@ -0,0 +1,103 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +void vector_sum(const unsigned int max, const unsigned int* a, const unsigned int* b, unsigned int* result) { + try { + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + result[x] = a[x] + b[x]; + )); + + auto gpu = pattern->getGpu(); + // Memory spaces for matrixes a and result are managed by hand by the programmer, + // while matrix b is managed automatically by GSParLib. + auto resultA = gpu->malloc(sizeof(unsigned int) * max, a); + resultA->copyIn(); + auto resultDev = gpu->malloc(sizeof(unsigned int) * max, result); + + // The direction GSPAR_PARAM_PRESENT indicates to GSParLib that the data is already + // in the GPU memory and no memory copies should be performed. + pattern->setParameter("a", resultA, GSPAR_PARAM_PRESENT) + .setParameter("b", sizeof(unsigned int) * max, b) + .setParameter("result", resultDev, GSPAR_PARAM_PRESENT); + + pattern->run({max, 0}); + + // Since the parameter was informed using GSPAR_PARAM_PRESENT, we should copy the data. + // This would not be necessary if we passed the parameter with the direction GSPAR_PARAM_OUT. 
+ resultDev->copyOut(); + + delete resultA; + delete resultDev; + delete pattern; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." << vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + + // Create memory objects + unsigned int* result = new unsigned int[VECTOR_SIZE]; + unsigned int* a = new unsigned int[VECTOR_SIZE]; + unsigned int* b = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + a[i] = i; + b[i] = i + 1; + result[i] = 0; + } + + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(VECTOR_SIZE, a, b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); + + delete result; + delete a; + delete b; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/pattern_api/vector_sum_map_parallel.cpp b/examples/pattern_api/vector_sum_map_parallel.cpp new file mode 100644 index 0000000..a930d6b --- /dev/null +++ b/examples/pattern_api/vector_sum_map_parallel.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +struct Task { + float* a; + float* b; + float* result; + float total; +}; + +void vector_sum(const unsigned int from, const unsigned int to, const unsigned int max, Task* tasks, Map* pattern) { + // Sequential version, for debugging purposes + // for (unsigned int t = from; t < to; t++) { + // tasks[t].total = 0; + // for (unsigned int x = 0; x < max; x++) { + // tasks[t].result[x] = tasks[t].a[x] + tasks[t].b[x]; + // tasks[t].total += tasks[t].result[x]; + // } + // } + // return; + + std::stringstream ss; + +#ifdef GSPAR_DEBUG + ss << "Pattern " << pattern << " processing tasks " << from+1 << " to " << to << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + for (unsigned int t = from; t < to; t++) { + try { + + // Now we set the real parameter values + pattern->setParameter("a", sizeof(float) * max, tasks[t].a) + .setParameter("b", sizeof(float) * max, tasks[t].b) + .setParameter("result", sizeof(float) * max, tasks[t].result, GSPAR_PARAM_OUT); + + + // As we compiled the kernel before, it is not needed to compile it again now. + // The pattern will automatically skip the compiling phase. + unsigned long dimensions[3] = {max, 0, 0}; // If the dimensions were to be different from the already compiled kernel, it would be re-compiled. 
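        // Compile-once / run-many is the core of this example: process_tasks()
        // below compiles the pattern a single time and hands each worker thread
        // its own clone, whose copy of the compiled kernel is reused here with
        // per-task parameters. A condensed sketch of that flow (illustrative
        // only, pseudo-loop in the comment):
        //
        //   pattern->compile(dimensions);          // once, before the threads start
        //   auto patternCopy = pattern->clone();   // one clone per worker thread
        //   // for each task t assigned to this worker:
        //   patternCopy->setParameter("a", sizeof(float) * max, tasks[t].a)
        //              .setParameter("b", sizeof(float) * max, tasks[t].b)
        //              .setParameter("result", sizeof(float) * max, tasks[t].result, GSPAR_PARAM_OUT);
        //   patternCopy->run(dimensions);          // reuses the compiled kernel, no recompilation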
+#ifdef GSPAR_DEBUG + ss << "Pattern " << pattern << " running task " << (t+1) << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + pattern->run(dimensions); + + // Reduce on CPU + for (unsigned int x = 0; x < max; x++) { + tasks[t].total += tasks[t].result[x]; + } + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + } + + delete pattern; + +} + +void process_tasks(const unsigned int max, unsigned int tasks_size, Task* tasks, unsigned int workers) { + // Sequential version, for debugging purposes + // for (unsigned int t = 0; t < tasks_size; t++) { + // tasks[t].total = 0; + // for (unsigned int x = 0; x < max; x++) { + // tasks[t].result[x] = tasks[t].a[x] + tasks[t].b[x]; + // tasks[t].total += tasks[t].result[x]; + // } + // } + // return; + + // We assume that tasks_size is divisible by workers + const unsigned int work_for_each = tasks_size/workers; + std::cout << "Starting " << workers << " workers to process " << tasks_size << " tasks, " << work_for_each << " tasks for each worker" << std::endl; + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + result[x] = a[x] + b[x]; + )); + + try { + + // Fixed value parameters can be set. Parameter placeholder are for compiling the kernel. + pattern->setParameterPlaceholder("a") + .setParameterPlaceholder("b") + .setParameterPlaceholder("result", GSPAR_PARAM_POINTER, GSPAR_PARAM_OUT); + + // Compile the kernel once before cloning the pattern. The compiled Kernel would be copied over to all pattern's clones + unsigned long dimensions[3] = {max, 0, 0}; + pattern->compile(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + // std::cout << "Pattern have " << pattern->getParameterList().size() << " parameters" << std::endl; + + std::thread* threads = new std::thread[workers]; + for (unsigned int w = 0; w < workers; w++) { + unsigned int from = w*work_for_each; + unsigned int to = from+work_for_each; + // Pattern must be cloned for each thread. The compiled kernel is thread-safe and therefore is carried over. + auto patternCopy = pattern->clone(); + threads[w] = std::thread(vector_sum, from, to, max, tasks, patternCopy); + } + + for (unsigned int w = 0; w < workers; w++) { + threads[w].join(); + } +} + +void print_vector(unsigned int size, const float* vector, float total = 0, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." << vector[size-1]; + if (total) std::cout << " = " << total; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + if (total) std::cout << "= " << total; + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 4) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + const unsigned int WORKERS = std::stoi(argv[2]); + const unsigned int NUM_TASKS = std::stoi(argv[3]); + + if (NUM_TASKS % WORKERS != 0) { + std::cerr << "Number of tasks (" << NUM_TASKS << ") must be divisible by number of workers (" << WORKERS << ")!" 
<< std::endl; + exit(-1); + } + std::cout << "Summing vectors:" << std::endl; + + Task* tasks = new Task[NUM_TASKS]; + // Create memory objects + for (unsigned int t = 0; t < NUM_TASKS; t++) { + tasks[t].result = new float[VECTOR_SIZE]; + tasks[t].a = new float[VECTOR_SIZE]; + tasks[t].b = new float[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + tasks[t].a[i] = (float)(i+t); + tasks[t].b[i] = (float)((i+t) * 2); + tasks[t].result[i] = 0; + } + + std::cout << "Task " << (t+1) << " vector A: "; + print_vector(VECTOR_SIZE, tasks[t].a); + std::cout << "Task " << (t+1) << " vector B: "; + print_vector(VECTOR_SIZE, tasks[t].b); + } + + auto t_start = std::chrono::steady_clock::now(); + + process_tasks(VECTOR_SIZE, NUM_TASKS, tasks, WORKERS); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Results: " << std::endl; + for (unsigned int t = 0; t < NUM_TASKS; t++) { + std::cout << "Task " << (t+1) << ": "; + print_vector(VECTOR_SIZE, tasks[t].result, tasks[t].total); + + delete[] tasks[t].result; + delete[] tasks[t].a; + delete[] tasks[t].b; + } + delete tasks; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/pattern_api/vector_sum_mapreduce.cpp b/examples/pattern_api/vector_sum_mapreduce.cpp new file mode 100644 index 0000000..1ad9077 --- /dev/null +++ b/examples/pattern_api/vector_sum_mapreduce.cpp @@ -0,0 +1,135 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternComposition.hpp" +using namespace GSPar::Pattern; + +void print_vector(unsigned long size, const unsigned long *vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +unsigned long vector_sum(const unsigned long max, const unsigned long *a, const unsigned long *b, unsigned long *result) { + try { + + auto map = new Map("result[x] = a[x] + b[x];"); + map->setParameter("a", sizeof(unsigned long) * max, a) + .setParameter("b", sizeof(unsigned long) * max, b) + .setParameter("result", sizeof(unsigned long) * max, result, GSPAR_PARAM_INOUT); + + unsigned long total = 5; + // "result" is the vector with the data + // "+" is the binary associative operator + // "total" must be an OUT pointer parameter + auto reduce = new Reduce("result", "+", "total"); + reduce->setParameter("result", sizeof(unsigned long) * max, result, GSPAR_PARAM_INOUT) + .setParameter("total", sizeof(unsigned long), &total, GSPAR_PARAM_INOUT); + + // Using initializer_list + // PatternComposition mapReduce {map, reduce}; + // Using variadic templates constructor + auto mapReduce = new PatternComposition(map, reduce); + + mapReduce->compilePatterns({max, 0}); + + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + mapReduce->run(); + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + delete mapReduce; + delete reduce; + delete map; + + return total; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long VECTOR_SIZE = std::stoul(argv[1]); + + // Create memory objects + unsigned long *result = new unsigned long[VECTOR_SIZE]; + unsigned long *a = new unsigned long[VECTOR_SIZE]; + unsigned long *b = new unsigned long[VECTOR_SIZE]; + for (unsigned long i = 0; i < VECTOR_SIZE; i++) { + a[i] = i; + b[i] = i + 1; + result[i] = 0; + } + +#ifdef DEBUG + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); +#endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + unsigned long total = vector_sum(VECTOR_SIZE, a, b, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + +#ifdef DEBUG + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); +#endif + + delete[] result; + delete[] a; + delete[] b; + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + +#ifdef DEBUG + std::cout << "Total: " << total << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; +#else + std::cout << total << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; +#endif + + return 0; +} diff --git a/examples/sequential/mandel.cpp 
b/examples/sequential/mandel.cpp new file mode 100644 index 0000000..315cd0d --- /dev/null +++ b/examples/sequential/mandel.cpp @@ -0,0 +1,100 @@ +#include +#include +#include +#ifdef DEBUG +#include "marX2/marX2.h" +#endif + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +void mandelbrot(const double init_a, const double init_b, const double range, const unsigned long dim, const unsigned long niter, unsigned char *M) { + double step = range/((double) dim); + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + for(unsigned long i = 0; i < dim; i++) { + double im=init_b+(step*i); + for (unsigned long j = 0; j < dim; j++) { + double cr; + double a=cr=init_a+step*j; + double b=im; + unsigned long k = 0; + for (k = 0; k < niter; k++) { + double a2=a*a; + double b2=b*b; + if ((a2+b2)>4.0) break; + b=2*a*b+im; + a=a2-b2+cr; + } + M[i*dim+j]= (unsigned char)(255-((k*255/niter))); + } + } + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing +} + +int main(int argc, char **argv) { + double init_a=-2.125,init_b=-1.5,range=3.0; + unsigned long dim = 1000; + unsigned long niter = 1000; + std::cout << std::fixed << std::setprecision(0); + + #ifndef DEBUG + if (argc<3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + exit(-1); + } + #endif + if (argc > 1) { + dim = strtoul(argv[1], 0, 10); + } + if (argc > 2) { + niter = strtoul(argv[2], 0, 10); + } + + unsigned char *M = new unsigned char[dim*dim]; + + #ifdef DEBUG + SetupXWindows(dim,dim,1,NULL,"Mandelbroot"); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + mandelbrot(init_a, init_b, range, dim, niter, M); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + #ifdef DEBUG + for(unsigned long i=0; i(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Teste: " << argv[0] << " " << dim << " " << niter << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << argv[0] << " " << dim << " " << niter << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + #ifdef DEBUG + getchar(); + CloseXWindows(); + #endif + + delete[] M; + return 0; +} diff --git a/examples/sequential/matrix_multi_cm.cpp b/examples/sequential/matrix_multi_cm.cpp new file mode 100644 index 0000000..8093ba3 --- /dev/null +++ b/examples/sequential/matrix_multi_cm.cpp @@ -0,0 +1,101 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +void multiply(const unsigned long size, const float *matrixA, const float *matrixB, float *result) { + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start 
computation + + for (unsigned long i = 0; i < size; i++) { + for (unsigned long j = 0; j < size; j++) { + float sum = 0; + for(unsigned long k = 0; k < size; k++) { + // result[i * size + j] += matrixA[i * size + k] * matrixB[k * size + j]; // Row-major + sum += matrixA[k * size + i] * matrixB[j * size + k]; // Column-major + } + result[j * size + i] = sum; + } + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing +} + +void printMatrix(const unsigned long size, float *matrix, bool compact = false) { + if (compact || size > 100) { + std::cout << matrix[0] << ".." << matrix[size-1] << ".." << matrix[size*size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << std::endl; + for (unsigned long j = 0; j < size; j++) { + // std::cout << matrix[i * size + j] << " "; // Row-major + std::cout << matrix[j * size + i] << " "; // Column-major + } + } + } + std::cout << std::endl; +} + +int main(int argc, char const *argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long MX = std::stoi(argv[1]); + + float *matrixA = new float[MX * MX]; + float *matrixB = new float[MX * MX]; + float *result = new float[MX * MX]; + for (unsigned long i = 0; i < MX; i++) { + for (unsigned long j = 0; j < MX; j++) { + // Row-major + // matrixA[i * MX + j] = i+1; + // matrixB[i * MX + j] = j+1; + // result[i * MX + j] = 0; + // Column-major + matrixA[j * MX + i] = i+1; + matrixB[j * MX + i] = j+1; + result[j * MX + i] = 0; + } + } + + #ifdef DEBUG + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + std::cout << "Matrix A: "; + printMatrix(MX, matrixA); + std::cout << "Matrix B: "; + printMatrix(MX, matrixB); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + multiply(MX, matrixA, matrixB, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Result: "; + printMatrix(MX, result); + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << result[0] << ".." << result[MX-1] << ".." 
<< result[MX*MX-1] << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + delete[] matrixA; + delete[] matrixB; + delete[] result; +} diff --git a/examples/sequential/matrix_multi_rm.cpp b/examples/sequential/matrix_multi_rm.cpp new file mode 100644 index 0000000..6462321 --- /dev/null +++ b/examples/sequential/matrix_multi_rm.cpp @@ -0,0 +1,95 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +void multiply(const unsigned long size, const float *matrixA, const float *matrixB, float *result) { + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + for (unsigned long i = 0; i < size; i++) { + for (unsigned long j = 0; j < size; j++) { + float sum = 0; + for(unsigned long k = 0; k < size; k++) { + sum += matrixA[i * size + k] * matrixB[k * size + j]; // Row-major + } + result[i * size + j] = sum; + } + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing +} + +void printMatrix(const unsigned long size, float *matrix, bool compact = false) { + if (compact || size > 100) { + std::cout << matrix[0] << ".." << matrix[size-1] << ".." << matrix[size*size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << std::endl; + for (unsigned long j = 0; j < size; j++) { + std::cout << matrix[i * size + j] << " "; // Row-major + } + } + } + std::cout << std::endl; +} + +int main(int argc, char const *argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long MX = std::stoi(argv[1]); + + float *matrixA = new float[MX * MX]; + float *matrixB = new float[MX * MX]; + float *result = new float[MX * MX]; + for (unsigned long i = 0; i < MX; i++) { + for (unsigned long j = 0; j < MX; j++) { + // Row-major + matrixA[i * MX + j] = i+1; + matrixB[i * MX + j] = j+1; + result[i * MX + j] = 0; + } + } + + #ifdef DEBUG + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + std::cout << "Matrix A: "; + printMatrix(MX, matrixA); + std::cout << "Matrix B: "; + printMatrix(MX, matrixB); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + multiply(MX, matrixA, matrixB, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Result: "; + printMatrix(MX, result); + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << result[0] << ".." << result[MX-1] << ".." 
<< result[MX*MX-1] << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + delete[] matrixA; + delete[] matrixB; + delete[] result; +} diff --git a/examples/sequential/primer.cpp b/examples/sequential/primer.cpp new file mode 100644 index 0000000..7e93895 --- /dev/null +++ b/examples/sequential/primer.cpp @@ -0,0 +1,79 @@ +/* *************************************************************************** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU General Public License. This exception does not however + * invalidate any other reasons why the executable file might be covered by + * the GNU General Public License. + * + **************************************************************************** + * Authors: Dalvan Griebler + * + * Copyright: GNU General Public License + * Description: Application that counts the number of primes between 1 and N (argument [...] are optional). + * File Name: prime.cpp + * Version: 1.0 (17/07/2016) + * Compilation Command: g++ -std=c++1y prime.cpp -o exe + * Exacution Command: ./exe -h +*/ + +#include +#include +#include +#include +#include +#include + +int prime_number ( int n ){ + int total = 0; + for (int i = 2; i <= n; i++ ){ + int prime = 1; + for (int j = 2; j < i; j++ ){ + if ( i % j == 0 ){ + prime = 0; + break; + } + } + // if (prime) { + // std::cout << "Prime found: " << i << std::endl; + // } + total = total + prime; + } + return total; +} + + +int main ( int argc, char *argv[]){ + int n = 0; + if (argc != 2){ + std::cout << "Usage: " << argv[0] << " " << std::endl; + exit(1); + } + n = atoi(argv[1]); + + auto t_start = std::chrono::high_resolution_clock::now(); + + int total_primes = prime_number( n ); + + auto t_end = std::chrono::high_resolution_clock::now(); + + std::cout << n << " max\t" << total_primes << " primes\t" << std::chrono::duration_cast(t_end-t_start).count() << "ms" << std::endl; + + return 0; +} +/******************************************************************************/ \ No newline at end of file diff --git a/examples/sequential/raytracer.cpp b/examples/sequential/raytracer.cpp new file mode 100644 index 0000000..3eae6a3 --- /dev/null +++ b/examples/sequential/raytracer.cpp @@ -0,0 +1,524 @@ +// [header] +// A very basic raytracer example. 
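+// It loads a sphere scene from an XML description (see examples/workloads/raytracer_scene.xml), optionally animates
+// the spheres across frames, and writes each rendered frame as a PPM image, either keeping all frames in memory or saving each one to disk as it is produced.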
+// [/header] +// [compile] +// c++ -o raytracer -O3 -Wall raytracer.cpp +// [/compile] +// [ignore] +// Copyright (C) 2012 www.scratchapixel.com +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// [/ignore] +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "rapidxml-1.13/rapidxml.hpp" + +#if defined __linux__ || defined __APPLE__ +// "Compiled for Linux +#else +// Windows doesn't define these values by default, Linux does +#define M_PI 3.141592653589793 +#define INFINITY 1e8 +#endif + +// This variable controls the maximum recursion depth +#define MAX_RAY_DEPTH 5 +// This variable controls if it should work in memory. If it is not defined, works in disk +#define WORK_IN_MEMORY + +#ifdef WORK_IN_MEMORY +#define WORKING_MEDIA "memory" +#else +#define WORKING_MEDIA "disk" +#endif + +template +class Vec3 +{ +public: + T x, y, z; + Vec3() : x(T(0)), y(T(0)), z(T(0)) {} + Vec3(T xx) : x(xx), y(xx), z(xx) {} + Vec3(T xx, T yy, T zz) : x(xx), y(yy), z(zz) {} + Vec3& normalize() + { + T nor2 = length2(); + if (nor2 > 0) { + T invNor = 1 / sqrt(nor2); + x *= invNor, y *= invNor, z *= invNor; + } + return *this; + } + Vec3 operator * (const T &f) const { return Vec3(x * f, y * f, z * f); } + Vec3 operator * (const Vec3 &v) const { return Vec3(x * v.x, y * v.y, z * v.z); } + T dot(const Vec3 &v) const { return x * v.x + y * v.y + z * v.z; } + Vec3 operator - (const Vec3 &v) const { return Vec3(x - v.x, y - v.y, z - v.z); } + Vec3 operator + (const Vec3 &v) const { return Vec3(x + v.x, y + v.y, z + v.z); } + Vec3& operator += (const Vec3 &v) { x += v.x, y += v.y, z += v.z; return *this; } + Vec3& operator *= (const Vec3 &v) { x *= v.x, y *= v.y, z *= v.z; return *this; } + Vec3 operator - () const { return Vec3(-x, -y, -z); } + T length2() const { return x * x + y * y + z * z; } + T length() const { return sqrt(length2()); } + friend std::ostream & operator << (std::ostream &os, const Vec3 &v) + { + os << "[" << v.x << " " << v.y << " " << v.z << "]"; + return os; + } +}; + +typedef Vec3 Vec3f; +typedef Vec3 Vec3b; + +class Sphere +{ +public: + const char *id; + Vec3f center; /// position of the sphere + float radius, radius2; /// sphere radius and radius^2 + Vec3f surfaceColor, emissionColor; /// surface color and emission (light) + float transparency, reflection; /// surface transparency and reflectivity + int animation_frame; + Vec3b animation_position_rand; + Vec3f animation_position; + Sphere() { } + Sphere( + const char *id, + const Vec3f &c, + const float &r, + const Vec3f &sc, + const float &refl = 0, + const float &transp = 0, + const Vec3f &ec = 0) : + id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc), + emissionColor(ec), transparency(transp), reflection(refl) + { + animation_frame = 0; + } + //[comment] + // Compute a ray-sphere intersection using the geometric solution + //[/comment] + bool intersect(const Vec3f &rayorig, 
const Vec3f &raydir, float &t0, float &t1) const + { + Vec3f l = center - rayorig; + float tca = l.dot(raydir); + if (tca < 0) return false; + float d2 = l.dot(l) - tca * tca; + if (d2 > radius2) return false; + float thc = sqrt(radius2 - d2); + t0 = tca - thc; + t1 = tca + thc; + + return true; + } +}; + +float mix(const float &a, const float &b, const float &mix) +{ + return b * mix + a * (1 - mix); +} + +//[comment] +// This is the main trace function. It takes a ray as argument (defined by its origin +// and direction). We test if this ray intersects any of the geometry in the scene. +// If the ray intersects an object, we compute the intersection point, the normal +// at the intersection point, and shade this point using this information. +// Shading depends on the surface property (is it transparent, reflective, diffuse). +// The function returns a color for the ray. If the ray intersects an object that +// is the color of the object at the intersection point, otherwise it returns +// the background color. +//[/comment] +Vec3f trace( + const Vec3f &rayorig, + const Vec3f &raydir, + const Sphere *spheres, + const unsigned int spheres_size, + const int &depth) +{ + //if (raydir.length() != 1) std::cerr << "Error " << raydir << std::endl; + float tnear = INFINITY; + const Sphere* sphere = NULL; + // find intersection of this ray with the sphere in the scene + for (unsigned i = 0; i < spheres_size; ++i) { + float t0 = INFINITY, t1 = INFINITY; + if (spheres[i].intersect(rayorig, raydir, t0, t1)) { + if (t0 < 0) t0 = t1; + if (t0 < tnear) { + tnear = t0; + sphere = &spheres[i]; + } + } + } + // if there's no intersection return black or background color + if (!sphere) return Vec3f(2); + Vec3f surfaceColor = 0; // color of the ray/surfaceof the object intersected by the ray + Vec3f phit = rayorig + raydir * tnear; // point of intersection + Vec3f nhit = phit - sphere->center; // normal at the intersection point + nhit.normalize(); // normalize normal direction + // If the normal and the view direction are not opposite to each other + // reverse the normal direction. That also means we are inside the sphere so set + // the inside bool to true. Finally reverse the sign of IdotN which we want + // positive. + float bias = 1e-4; // add some bias to the point from which we will be tracing + bool inside = false; + if (raydir.dot(nhit) > 0) nhit = -nhit, inside = true; + if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < MAX_RAY_DEPTH) { + float facingratio = -raydir.dot(nhit); + // change the mix value to tweak the effect + float fresneleffect = mix(pow(1 - facingratio, 3), 1, 0.1); + // compute reflection direction (not need to normalize because all vectors + // are already normalized) + Vec3f refldir = raydir - nhit * 2 * raydir.dot(nhit); + refldir.normalize(); + Vec3f reflection = trace(phit + nhit * bias, refldir, spheres, spheres_size, depth + 1); + Vec3f refraction = 0; + // if the sphere is also transparent compute refraction ray (transmission) + if (sphere->transparency) { + float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface? 
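+ // Snell's law: eta is the ratio of the refractive indices; a negative k would mean total internal reflection,
+ // which this simple implementation does not handle separately.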
+ float cosi = -nhit.dot(raydir); + float k = 1 - eta * eta * (1 - cosi * cosi); + Vec3f refrdir = raydir * eta + nhit * (eta * cosi - sqrt(k)); + refrdir.normalize(); + refraction = trace(phit - nhit * bias, refrdir, spheres, spheres_size, depth + 1); + } + // the result is a mix of reflection and refraction (if the sphere is transparent) + surfaceColor = ( + reflection * fresneleffect + + refraction * (1 - fresneleffect) * sphere->transparency) * sphere->surfaceColor; + } + else { + // it's a diffuse object, no need to raytrace any further + for (unsigned i = 0; i < spheres_size; ++i) { + if (spheres[i].emissionColor.x > 0) { + // this is a light + Vec3f transmission = 1; + Vec3f lightDirection = spheres[i].center - phit; + lightDirection.normalize(); + for (unsigned j = 0; j < spheres_size; ++j) { + if (i != j) { + float t0, t1; + if (spheres[j].intersect(phit + nhit * bias, lightDirection, t0, t1)) { + transmission = 0; + break; + } + } + } + surfaceColor += sphere->surfaceColor * transmission * + std::max(float(0), nhit.dot(lightDirection)) * spheres[i].emissionColor; + } + } + } + + return surfaceColor + sphere->emissionColor; +} + + +void save_image(const std::string output_folder, const int frame, const unsigned int width, const unsigned int height, Vec3f *image) { + // Save result to a PPM image (keep these flags if you compile under Windows) + std::stringstream ss; + ss << std::setfill('0') << std::setw(5) << frame; + std::string filename = output_folder + "/frame" + ss.str() + ".ppm"; +#ifdef DEBUG + std::cout << "[Work] Writing frame " << frame << " to " << filename << std::endl; +#endif + std::ofstream ofs(filename, std::ios::out | std::ios::binary); + ofs << "P6\n" << width << " " << height << "\n255\n"; + for (unsigned i = 0; i < width * height; ++i) { + ofs << (unsigned char)(std::min(float(1), image[i].x) * 255) << + (unsigned char)(std::min(float(1), image[i].y) * 255) << + (unsigned char)(std::min(float(1), image[i].z) * 255); + } + ofs.close(); +} + + +void raytrace(std::string output_folder, int total_frames, unsigned int width, unsigned int height, const std::vector &initial_spheres) { + float invWidth = 1 / float(width); + float invHeight = 1 / float(height); + float fov = 30; + float aspectratio = width / float(height); + float angle = tan(M_PI * 0.5 * fov / 180.); + +#ifdef WORK_IN_MEMORY + unsigned int total_memory = sizeof(Vec3f)*total_frames*width*height; + std::string total_memory_unit = " bytes"; + if (total_memory > 1024) { + total_memory = (total_frames*width*height)/1024; + total_memory_unit = " KB"; + } + if (total_memory > (10*1024)) { + total_memory /= 1024; + total_memory_unit = " MB"; + } +#ifdef DEBUG + std::cout << "[Init] Allocating " << total_memory << total_memory_unit << " of memory to store images" << std::endl; +#endif + Vec3f **images = new Vec3f*[total_frames]; + for (int f=0; f 0 && frame < spheres[i].animation_frame)) { + continue; + } + + int adjusted_frame = frame; + if (spheres[i].animation_frame < 0) { + if (frame > spheres[i].animation_frame*-1) { + adjusted_frame = spheres[i].animation_frame*-1; + } + } else if (spheres[i].animation_frame > 0) { + adjusted_frame -= spheres[i].animation_frame; + } + + if (spheres[i].animation_position.x) { + if (spheres[i].animation_position_rand.x) { + spheres[i].center.x += (drand48()*spheres[i].animation_position.x); + } else { + spheres[i].center.x += adjusted_frame*spheres[i].animation_position.x; + } + } + if (spheres[i].animation_position.y) { + if (spheres[i].animation_position_rand.y) { + 
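+ // "random" animation: jitter the Y position by up to animation_position.y per frame (drand48() returns a value in [0,1)).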
spheres[i].center.y += (drand48()*spheres[i].animation_position.y); + } else { + spheres[i].center.y += adjusted_frame*spheres[i].animation_position.y; + } + } + if (spheres[i].animation_position.z) { + if (spheres[i].animation_position_rand.z) { + spheres[i].center.z += (drand48()*spheres[i].animation_position.z); + } else { + spheres[i].center.z += adjusted_frame*spheres[i].animation_position.z; + } + } + } + +#ifdef WORK_IN_MEMORY + Vec3f *image = images[frame-1]; +#else + Vec3f *image = new Vec3f[width * height]; +#endif + // Trace rays + for (unsigned y = 0; y < height; ++y) { + for (unsigned x = 0; x < width; ++x) { + float xx = (2 * ((x + 0.5) * invWidth) - 1) * angle * aspectratio; + float yy = (1 - 2 * ((y + 0.5) * invHeight)) * angle; + Vec3f raydir(xx, yy, -1); + raydir.normalize(); + image[y*width+x] = trace(Vec3f(0), raydir, spheres, spheres_size, 0); + } + } + +#ifndef WORK_IN_MEMORY + save_image(output_folder, frame, width, height, image); + delete [] image; +#endif + } + +# ifndef NO_TIME_MEASUREMENT +#ifdef DEBUG + std::cout << "[Time] Stopping time measurement" << std::endl; +#endif + clock_t cpu_end = clock(); + time(&wall_end); + double cpu_time_seconds = ((double) (cpu_end - cpu_start)) / CLOCKS_PER_SEC; + double wall_time_seconds = difftime(wall_end, wall_start); + printf("The generation of %d frames in %s of %u x %u with %lu spheres took:\n", total_frames, WORKING_MEDIA, width, height, initial_spheres.size()); + printf("%.0f wall-clock seconds (%.2f FPS)\n", wall_time_seconds, ((double)total_frames)/wall_time_seconds); + printf("%.2f CPU time seconds\n", cpu_time_seconds); +# endif + +#ifdef WORK_IN_MEMORY + for (int frame = 1; frame <= total_frames; frame++) { + save_image(output_folder, frame, width, height, images[frame-1]); + delete [] images[frame-1]; + } + delete [] images; +#endif +} + + +int main(int argc, char **argv) +{ + int image_size_parameter = 2; + int total_frames = 1; + + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " [ [ ]]" << std::endl; + std::cerr << " : XML with the scene description" << std::endl; + std::cerr << " : Folder on which produce output images" << std::endl; + std::cerr << " : Size of images to generate, a single integer meaning 1=320x180, 2=640x360, 4=HD, 6=FHD and so on. Defaults to " << image_size_parameter << std::endl; + std::cerr << " : Number of frames to produce. 
Defaults to " << total_frames << std::endl; + exit(1); + } + srand48(13); + + std::string scene_filename(argv[1]); + std::string output_folder = argv[2]; + if (argc > 3) { + image_size_parameter = atoi(argv[3]); + } + if (argc > 4) { + total_frames = atoi(argv[4]); + } + + // 1 = 320x180 + // 2 = 640x360 + // 4 = 1280x720 (HD) + // 6 = 1920x1080 (FHD) + unsigned int image_size_multiplier = 20*image_size_parameter; + + unsigned int width = image_size_multiplier*16; + unsigned int height = image_size_multiplier*9; + + std::vector initial_spheres; + +#ifdef DEBUG + std::cout << "[Init] Generating " << total_frames << " frames of " << width << "x" << height << " in " << WORKING_MEDIA << " in " << output_folder << std::endl; + std::cout << "[Init] Loading scene from " << scene_filename << std::endl; +#endif + + // Parses the scene + std::ifstream scene_file(scene_filename, std::ios::binary | std::ios::ate); + std::streamsize scene_file_size = scene_file.tellg(); + scene_file.seekg(0, std::ios::beg); + char *scene_buffer = new char[scene_file_size]; + if (scene_file.read(scene_buffer, scene_file_size)) { + rapidxml::xml_document<> doc; + doc.parse<0>(scene_buffer); + rapidxml::xml_node<> *scene_node = doc.first_node("scene"); + + rapidxml::xml_node<> *spheres_node = scene_node->first_node("spheres"); + rapidxml::xml_node<> *sphere_node = spheres_node->first_node(); + while (sphere_node != 0) { + // position, radius, surface color, reflectivity, transparency, emission color + initial_spheres.emplace_back( + sphere_node->first_attribute("id")->value(), + Vec3f( //Center position + atof(sphere_node->first_node("position")->first_attribute("x")->value()), + atof(sphere_node->first_node("position")->first_attribute("y")->value()), + atof(sphere_node->first_node("position")->first_attribute("z")->value()) + ), + atof(sphere_node->first_node("size")->first_attribute("radius")->value()), // Radius + Vec3f( //Surface color + atof(sphere_node->first_node("surface_color")->first_attribute("red")->value()), + atof(sphere_node->first_node("surface_color")->first_attribute("green")->value()), + atof(sphere_node->first_node("surface_color")->first_attribute("blue")->value()) + ), + atof(sphere_node->first_node("reflectivity")->first_attribute("value")->value()), // Reflectivity + atof(sphere_node->first_node("transparency")->first_attribute("value")->value()) // Transparency + ); + if (sphere_node->first_node("emission_color")) { + initial_spheres.back().emissionColor = Vec3f( + atof(sphere_node->first_node("emission_color")->first_attribute("red")->value()), + atof(sphere_node->first_node("emission_color")->first_attribute("green")->value()), + atof(sphere_node->first_node("emission_color")->first_attribute("blue")->value()) + ); + } + sphere_node = sphere_node->next_sibling(); + } +#ifdef DEBUG + std::cout << "[Init] Loaded " << initial_spheres.size() << " spheres, looking for animations" << std::endl; +#endif + + rapidxml::xml_node<> *animation_node = scene_node->first_node("animation"); + for (rapidxml::xml_node<> *sphere_animation = animation_node->first_node(); + sphere_animation; sphere_animation = sphere_animation->next_sibling()) { + std::string id = sphere_animation->first_attribute("id")->value(); + for(unsigned long i = 0; i != initial_spheres.size(); i++) { + if (id == initial_spheres[i].id) { + rapidxml::xml_node<> *position_node = sphere_animation->first_node("position"); + if (position_node) { + rapidxml::xml_attribute<> *attr; + attr = position_node->first_attribute("after"); + if (attr) { + 
initial_spheres[i].animation_frame = atoi(attr->value()); + } + attr = position_node->first_attribute("before"); + if (attr) { + initial_spheres[i].animation_frame = atoi(attr->value())*-1; + } + attr = position_node->first_attribute("x"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.x = true; + initial_spheres[i].animation_position.x = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.x = atof(attr->value()); + } + } + attr = position_node->first_attribute("y"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.y = true; + initial_spheres[i].animation_position.y = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.y = atof(position_node->first_attribute("y")->value()); + } + } + attr = position_node->first_attribute("z"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.z = true; + initial_spheres[i].animation_position.z = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.z = atof(position_node->first_attribute("z")->value()); + } + } + } + } + } + } +#ifdef DEBUG + std::cout << "[Init] Finished loading animation for spheres" << std::endl; +#endif + + } + + raytrace(output_folder, total_frames, width, height, initial_spheres); + + return 0; +} \ No newline at end of file diff --git a/examples/sequential/reduce.cpp b/examples/sequential/reduce.cpp new file mode 100644 index 0000000..1d0d541 --- /dev/null +++ b/examples/sequential/reduce.cpp @@ -0,0 +1,79 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +unsigned int reduce_vector(const size_t vector_size, const unsigned int* vector) { + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + unsigned int total = 0; + for (size_t i = 0; i < vector_size; i++) { + total += vector[i]; + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + return total; +} + +void print_vector(size_t size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (size_t i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const size_t VECTOR_SIZE = std::stoi(argv[1]); + + // Create memory objects + unsigned int *vector = new unsigned int[VECTOR_SIZE]; + for (size_t i = 0; i < VECTOR_SIZE; i++) { + vector[i] = 1; + } + +#ifdef DEBUG + std::cout << "Reducing vector:" << std::endl; + print_vector(VECTOR_SIZE, vector); +#endif + + unsigned int total = reduce_vector(VECTOR_SIZE, vector); + + delete[] vector; + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + +#ifdef DEBUG + std::cout << "Result: " << total << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; +#else + std::cout << total << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; +#endif + + return 0; +} diff --git a/examples/sequential/saxpy.cpp b/examples/sequential/saxpy.cpp new file mode 100644 index 0000000..cbcc5ae --- /dev/null +++ b/examples/sequential/saxpy.cpp @@ -0,0 +1,96 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +unsigned long saxpy(const unsigned long vector_size, const unsigned long scal, const unsigned long* a, const unsigned long* b, unsigned long* result) { + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + unsigned long total = 0; + for (unsigned long i = 0; i < vector_size; i++) { + result[i] = scal*a[i] + b[i]; + total += result[i]; + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + return total; +} + +void print_vector(unsigned long size, const unsigned long* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 3) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long VECTOR_SIZE = std::stoul(argv[1]); + const unsigned long SCALAR = std::stoul(argv[2]); + + // Create memory objects + unsigned long *result = new unsigned long[VECTOR_SIZE]; + unsigned long *a = new unsigned long[VECTOR_SIZE]; + unsigned long *b = new unsigned long[VECTOR_SIZE]; + for (unsigned long i = 0; i < VECTOR_SIZE; i++) { + a[i] = (unsigned long)i; + b[i] = (unsigned long)i + 1; + result[i] = 0; + } + +#ifdef DEBUG + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); +#endif + + unsigned long total = saxpy(VECTOR_SIZE, SCALAR, a, b, result); + +#ifdef DEBUG + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); +#endif + + delete[] result; + delete[] a; + delete[] b; + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + +#ifdef DEBUG + std::cout << "Total: " << total << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; +#else + std::cout << total << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; +#endif + + return 0; +} diff --git a/examples/sequential/vector_sum.cpp b/examples/sequential/vector_sum.cpp new file mode 100644 index 0000000..ac12986 --- /dev/null +++ b/examples/sequential/vector_sum.cpp @@ -0,0 +1,95 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +unsigned long vector_sum(const unsigned long vector_size, const unsigned long *a, const unsigned long *b, unsigned long *result) { + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + unsigned long total = 0; + for (unsigned long i = 0; i < vector_size; i++) { + result[i] = a[i] + b[i]; + total += result[i]; + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + return total; +} + +void print_vector(unsigned long size, const unsigned long* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long VECTOR_SIZE = std::stoul(argv[1]); + + // Create memory objects + unsigned long *result = new unsigned long[VECTOR_SIZE]; + unsigned long *a = new unsigned long[VECTOR_SIZE]; + unsigned long *b = new unsigned long[VECTOR_SIZE]; + for (unsigned long i = 0; i < VECTOR_SIZE; i++) { + a[i] = i; + b[i] = i + 1; + result[i] = 0; + } + +#ifdef DEBUG + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); +#endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + unsigned long total = vector_sum(VECTOR_SIZE, a, b, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + +#ifdef DEBUG + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); +#endif + + delete[] result; + delete[] a; + delete[] b; + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + +#ifdef DEBUG + std::cout << "Total: " << total << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; +#else + std::cout << total << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; +#endif + + return 0; +} diff --git a/examples/workloads/raytracer_scene.xml b/examples/workloads/raytracer_scene.xml new file mode 100644 index 0000000..62180e7 --- /dev/null +++ b/examples/workloads/raytracer_scene.xml @@ -0,0 +1,137 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/GSPar.hpp b/src/GSPar.hpp new file mode 100644 index 0000000..572f677 --- /dev/null +++ b/src/GSPar.hpp @@ -0,0 +1,13 @@ + +#ifndef __GSPAR_INCLUDED__ +#define __GSPAR_INCLUDED__ + +// Include Drivers +#include "GSPar_CUDA.hpp" +#include "GSPar_OpenCL.hpp" + +// Include Patterns +#include "GSPar_PatternMap.hpp" +#include "GSPar_PatternReduce.hpp" + +#endif diff --git a/src/GSPar_Base.cpp b/src/GSPar_Base.cpp new file mode 100644 index 0000000..ab95146 --- /dev/null +++ b/src/GSPar_Base.cpp @@ -0,0 +1,28 @@ + +#include +#include +#include //std::generate_n + +namespace GSPar { + static bool srandInitiated = false; + + std::string getRandomString(short length) { + if (!srandInitiated) { + // Initialize random seed with ms since linux epoch + std::srand(std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count()); + srandInitiated = true; + } + + auto randchar = []() -> char { + const char charset[] = + "0123456789" + 
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + const size_t max_index = (sizeof(charset) - 1); + return charset[ std::rand() % max_index ]; + }; + std::string generatedName(length,0); + std::generate_n(generatedName.begin(), length, randchar); + return generatedName; + } +} diff --git a/src/GSPar_Base.hpp b/src/GSPar_Base.hpp new file mode 100644 index 0000000..0dfc0f8 --- /dev/null +++ b/src/GSPar_Base.hpp @@ -0,0 +1,37 @@ + +#ifndef __GSPAR_BASE_INCLUDED__ +#define __GSPAR_BASE_INCLUDED__ + +#include +#include +#include //std::generate_n + +#define GSPAR_STRINGIZE_SOURCE(...) #__VA_ARGS__ + +namespace GSPar { + + class GSParException : public std::exception { + protected: + std::string msg; + std::string details; + + public: + GSParException() : std::exception() { } + explicit GSParException(std::string msg, std::string details = "") { + this->msg = msg; + this->details = details; + } + virtual std::string what() { return this->msg; } + virtual std::string getDetails() { return this->details; } + }; + + // Auxiliary functions + std::string getRandomString(short length); + + template + inline bool instanceof(const T*) { + return std::is_base_of::value; + } +} + +#endif diff --git a/src/GSPar_BaseGPUDriver.hpp b/src/GSPar_BaseGPUDriver.hpp new file mode 100644 index 0000000..c95f33c --- /dev/null +++ b/src/GSPar_BaseGPUDriver.hpp @@ -0,0 +1,796 @@ + +#ifndef __GSPAR_BASEGPUDRIVER_INCLUDED__ +#define __GSPAR_BASEGPUDRIVER_INCLUDED__ + +#define SUPPORTED_DIMS 3 + +#include +#include +#include +#include +#include +#include +#ifdef GSPAR_DEBUG +#include //std::cout and std::cerr +#endif + +///// Forward declarations ///// + +namespace GSPar { + namespace Driver { + + enum Runtime { + GSPAR_RT_NONE, + GSPAR_RT_CUDA, + GSPAR_RT_OPENCL + }; + + struct SingleDimension { + unsigned long max; + unsigned long min; + // TODO step + // unsigned long step; + + SingleDimension() : SingleDimension(0, 0) { } + SingleDimension(unsigned long max) : SingleDimension(max, 0) { } + SingleDimension(unsigned long max, unsigned long min) : max(max), min(min) { } + + unsigned long delta() { return this->max - this->min; } + + std::string toString() { + std::string out; + if (this->min) { + out += std::to_string(this->min) + " to "; + } + out += std::to_string(this->max); + return out; + } + + SingleDimension& operator= (SingleDimension other) { // https://en.cppreference.com/w/cpp/language/copy_assignment + if (&other == this) return *this; + this->max = other.max; + this->min = other.min; + return *this; + } + explicit operator bool() const { return this->max > 0; } + bool operator==(SingleDimension& other) { + return this->max == other.max && this->min == other.min; + } + bool operator!=(SingleDimension& other) { return !(*this == other); } + SingleDimension& operator*=(unsigned int number) { + this->max *= number; + this->min *= number; + return *this; + } + SingleDimension operator*(unsigned int number) { return SingleDimension(this->max*number, this->min*number); } + }; + + struct Dimensions { + // TODO remove this crap + SingleDimension _empty; + SingleDimension x; + SingleDimension y; + SingleDimension z; + + Dimensions() : _empty(0), x(0), y(0), z(0) { }; + Dimensions(SingleDimension x, SingleDimension y) : Dimensions() { + this->x = x; + this->y = y; + } + Dimensions(SingleDimension x, SingleDimension y, SingleDimension z) : Dimensions(x, y) { + this->z = z; + } + Dimensions(unsigned long maxX, unsigned long maxY, unsigned long maxZ) : Dimensions(SingleDimension(maxX), SingleDimension(maxY), 
SingleDimension(maxZ)) { }; + /** + * Creates a 3-Dimensions with specified max values and min=0 + * @param max Max values for the 3 dimensions + */ + Dimensions(unsigned long max[3]) : Dimensions(max[0], max[1], max[2]) { }; + /** + * Created a 3-Dimensions with specified max and min values. + * Eg.: dims[0][0] is max value for X dim, dims[0][1] is min value for X dim, dims[1] is Y, dims[2] is Z + * @param dims Max and min values for dimensions + */ + Dimensions(unsigned long dims[3][2]) : Dimensions(SingleDimension(dims[0][0], dims[0][1]), SingleDimension(dims[1][0], dims[1][1]), SingleDimension(dims[2][0], dims[2][1])) { }; + // This constructor gets called instead of copy assignment when assignin directly or passing values to function + Dimensions(const Dimensions &other) : Dimensions(other.x, other.y, other.z) { }; + + bool is(unsigned int dimension) { return (bool)((*this)[dimension]); }; + int getCount() const { return (bool)this->x + (bool)this->y + (bool)this->z; } + + std::string getName(unsigned int dimension) { + if (this->is(dimension)) { + switch (dimension) { + case 0: return "x"; + case 1: return "y"; + case 2: return "z"; + } + } + return NULL; + } + + std::string toString() { + std::string out; + out += "[dim" + std::to_string(this->getCount()) + ":"; + for (int d = 0; d < this->getCount(); d++) { + out += (*this)[d].toString() + "x"; + } + out.pop_back(); + out += "]"; + return out; + } + + // https://en.cppreference.com/w/cpp/language/operators + SingleDimension& operator[] (const int index) { + if (index == 0) return this->x; + if (index == 1) return this->y; + if (index == 2) return this->z; + return this->_empty; // TODO Should we throw an exception? + } + Dimensions& operator= (Dimensions& other) { // https://en.cppreference.com/w/cpp/language/copy_assignment + if (&other == this) return *this; + this->_empty = other._empty; + this->x = other.x; + this->y = other.y; + this->z = other.z; + return *this; + } + bool operator==(Dimensions& other) { + bool ret = this->getCount() == other.getCount(); + for (int d = 0; ret && d < 3; d++) { + ret = ret && (*this)[d] == other[d]; + } + return ret; + } + bool operator!=(Dimensions& other) { return !(*this == other); } + Dimensions& operator*=(unsigned int number) { + for (int d = 0; d < this->getCount(); d++) { + (*this)[d] *= number; + } + return *this; + } + Dimensions operator*(unsigned int number) { return Dimensions( + this->x ? this->x*number : 0, + this->y ? this->y*number : 0, + this->z ? this->z*number : 0 + ); } + explicit operator bool() const { return this->getCount() > 0 && (bool)this->x; } + }; + + template + class BaseException; + + class BaseExecutionFlowBase; + + template + class BaseExecutionFlow; + + template + class BaseAsyncExecutionSupport; + + class BaseInstanceBase; + + template + class BaseInstance; + + class BaseDeviceBase; + + template + class BaseDevice; + + class BaseKernelBase; + + template + class BaseKernel; + + /** + * Class to allow storing pointers to BaseMemoryObject without templates. 
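+ * Only the buffer size and the associated host pointer are exposed here, as they do not depend on the template parameters.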
+ */ + class BaseMemoryObjectBase { + protected: + size_t size; + void* hostPtr = NULL; + public: + BaseMemoryObjectBase() {} + virtual ~BaseMemoryObjectBase() {} + + size_t getSize() { return this->size; } + void* getHostPointer() { return this->hostPtr; } + }; + + template + class BaseMemoryObject; + + template + class BaseChunkedMemoryObject; + + template + class BaseStreamElement; + + class BaseKernelGeneration; + + } +} + +#include "GSPar_Base.hpp" +#include "GSPar_BaseParallelPattern.hpp" + +namespace GSPar { + namespace Driver { + + #define defaultExceptionDetails() std::string(__func__) + " in " + std::string(__FILE__) + ":" + std::to_string(__LINE__) + #define throwExceptionIfFailed( code ) Exception::throwIfFailed( code, defaultExceptionDetails() ) + + /** + * Base class for exceptions + * + * @param Type of the (lib-specific) error code + */ + template + class BaseException : public GSParException { + protected: + TLibCode code; + + virtual std::string getErrorString(TLibCode code) = 0; + + template + static TChildException* checkError(TLibCode code, TLibCode successCode, std::string details = "") { + if (code != successCode) { + return new TChildException(code, details); + } + return nullptr; + } + + template + static void throwIfFailed(TLibCode code, TLibCode sucessCode, std::string details = "") { + TChildException* ex = BaseException::checkError(code, sucessCode, details); + if (ex != nullptr) { + throw *ex; + } + } + + public: + BaseException() : GSParException() { } + explicit BaseException(std::string msg, std::string details = "") : GSParException(msg, details) { } + explicit BaseException(TLibCode code, std::string details = "") : GSParException("", details) { + this->code = code; + this->details = details; + // This virtual method call must be placed in child's implementation + // this->msg = this->getErrorString(code); + } + TLibCode getCode() { + return this->code; + } + }; + + /** + * Class to allow storing pointers to BaseExecutionFlow without templates. + */ + class BaseExecutionFlowBase { + public: + BaseExecutionFlowBase() {} + virtual ~BaseExecutionFlowBase() {} + }; + + /** + * Classes that manage an execution flow should inherit from this class. + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the (lib-specific) underlying flow control object + */ + template + class BaseExecutionFlow : public BaseExecutionFlowBase { + protected: + TDevice* device = NULL; + TLibFlowObject flowObject = NULL; + + public: + BaseExecutionFlow() : BaseExecutionFlowBase() { } + explicit BaseExecutionFlow(TDevice* device) { + this->device = device; + } + virtual ~BaseExecutionFlow() { } + virtual void setBaseFlowObject(TLibFlowObject flowObject) { this->flowObject = flowObject; } + virtual TLibFlowObject getBaseFlowObject() { return this->flowObject; } + virtual void setDevice(TDevice* device) { this->device = device; } + virtual TDevice* getDevice() { return this->device; } + + /** + * Start the execution flow if it hasn't been started yet. + * Can be safely called multiple times. + */ + virtual TLibFlowObject start() = 0; + /** + * Wait for the operations in this execution flow to complete. + */ + virtual void synchronize() = 0; + + /** + * Check if the execution flow was provided and get the device's default execution flow otherwise. 
+ * Start the execution flow and returns + * + * @param device The device from which get the default execution flow if the executionFlow is NULL + * @param executionFlow The execution flow to start + */ + static TLibFlowObject checkAndStartFlow(TDevice* device, TExecutionFlow* executionFlow = NULL) { + if (executionFlow) { + return executionFlow->start(); + } else { + return device->startDefaultExecutionFlow(); + } + } + }; + + /** + * Classes that support asynchronous execution should inherit from this class. + * + * @param Type of the (lib-specific) underlying async object + */ + template + class BaseAsyncExecutionSupport { + protected: + TLibAsyncObj asyncObject = NULL; + bool runningAsync = false; + + virtual void clearRunningAsync() { + this->runningAsync = false; + } + + public: + BaseAsyncExecutionSupport(TLibAsyncObj asyncObj = NULL) { + if (asyncObj) this->asyncObject = asyncObj; + } + virtual ~BaseAsyncExecutionSupport() { } + virtual void setBaseAsyncObject(TLibAsyncObj asyncObject) { this->asyncObject = asyncObject; } + virtual TLibAsyncObj getBaseAsyncObject() { return this->asyncObject; } + virtual bool isRunningAsync() { return this->runningAsync; } + + /** + * Wait for the async operations represented by this async object to complete + */ + virtual void waitAsync() = 0; + }; + + /** + * Class to allow references to BaseInstance without templates. + */ + class BaseInstanceBase { + protected: + Runtime runtime; + BaseInstanceBase(Runtime rt) : runtime(rt) { } + public: + BaseInstanceBase() { } + virtual ~BaseInstanceBase() { } + }; + + /** + * This class represents the entry point of the API. + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the specialized BaseKernel class + * @param Type of the specialized BaseMemoryObject class + * @param Type of the specialized BaseChunkedMemoryObject class + * @param Type of the specialized BaseKernelGenerator class + */ + template + class BaseInstance : + public BaseInstanceBase { + private: + TKernelGenerator* kernelGenerator = nullptr; + + protected: + bool instanceInitiated = false; + std::vector devices; + virtual void loadGpuList() = 0; + virtual void clearGpuList() { + for (size_t i = 0; i < this->devices.size(); i++) { + delete this->devices[i]; + } + this->devices.clear(); + } + + BaseInstance(Runtime rt) : BaseInstanceBase(rt) { } + + public: + BaseInstance() {} + virtual ~BaseInstance() { + if (!this->devices.empty()) { + this->clearGpuList(); + } + } + virtual void init() = 0; + virtual unsigned int getGpuCount() = 0; + virtual std::vector getGpuList() { + if (this->devices.empty()) { + this->loadGpuList(); + } + return this->devices; + } + virtual TDevice* getGpu(unsigned int index) { + std::vector gpus = this->getGpuList(); + if (gpus.size() > index) { + return gpus.at(index); + } + return nullptr; + } + virtual TKernelGenerator* getKernelGenerator() { + // TODO implement thread safety + if (!this->kernelGenerator) { + this->kernelGenerator = new TKernelGenerator(); + } + return this->kernelGenerator; + } + + static TExecutionFlow getExecutionFlowType() { return TExecutionFlow(); } + static TDevice getDeviceType() { return TDevice(); } + static TKernel getKernelType() { return TKernel(); } + static TMemoryObject getMemoryObjectType() { return TMemoryObject(); } + static TChunkedMemoryObject getChunkedMemoryObjectType() { return TChunkedMemoryObject(); } + }; + + /** + * Class to allow references to BaseDevice without templates. 
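+ *
+ * Concrete devices are normally obtained through a driver instance rather than
+ * constructed directly; a hedged sketch using the CUDA driver (GPU index 0 is illustrative):
+ *   auto* driver = CUDA::Instance::getInstance();
+ *   driver->init();
+ *   auto* gpu = driver->getGpu(0);  // BaseDevice-derived object, or nullptr if absent
+ *   std::cout << gpu->getName() << ": " << gpu->getComputeUnitsCount() << " compute units" << std::endl;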
+ */ + class BaseDeviceBase { + public: + BaseDeviceBase() { } + virtual ~BaseDeviceBase() { } + }; + + /** + * Class that represent a single GPU device + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseKernel class + * @param Type of the specialized BaseMemoryObject class + * @param Type of the specialized BaseChunkedMemoryObject class + * @param Type of the (lib-specific) underlying context object + * @param Type of the (lib-specific) underlying device object + * @param Type of the (lib-specific) underlying async execution flow object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseDevice : + public BaseDeviceBase { + protected: + mutable std::mutex libContextMutex; + TLibContext libContext = NULL; + TLibDevice libDevice = NULL; + mutable std::mutex defaultExecutionFlowMutex; + TExecutionFlow* defaultExecutionFlow = NULL; //TODO use a smart pointer + + public: + BaseDevice() { } + virtual ~BaseDevice() { } + virtual TExecutionFlow* getDefaultExecutionFlow() = 0; + virtual void setBaseDeviceObject(TLibDevice device) { this->libDevice = device; } + virtual TLibDevice getBaseDeviceObject() { return this->libDevice; } + virtual void setContext(TLibContext context) { this->libContext = context; } + virtual TLibContext getContext() { return this->libContext; } + + virtual TLibFlowObject startDefaultExecutionFlow() = 0; + virtual const std::string getName() = 0; + virtual unsigned int getComputeUnitsCount() = 0; // Number of multiprocessors + virtual unsigned int getWarpSize() = 0; + virtual unsigned int getMaxThreadsPerBlock() = 0; + /** + * Device's global memory size + */ + virtual unsigned long getGlobalMemorySizeBytes() = 0; + /** + * Device's local (block-shared) memory size + */ + virtual unsigned long getLocalMemorySizeBytes() = 0; + /** + * Device's amount of shared memory per compute unit + */ + virtual unsigned long getSharedMemoryPerComputeUnitSizeBytes() = 0; + virtual unsigned int getClockRateMHz() = 0; + virtual bool isIntegratedMainMemory() = 0; + // virtual bool supportUnifiedMemory() = 0; //CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING + virtual TMemoryObject* malloc(long size, void* hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) = 0; + virtual TMemoryObject* malloc(long size, const void* hostPtr = nullptr) = 0; + virtual TChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, void** hostPointers = nullptr, bool readOnly = false, bool writeOnly = false) = 0; + virtual TChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, const void** hostPointers = nullptr) = 0; + // Can't convert this BaseGPUDriver instance to child Driver instance + // virtual TMemoryObject* malloc(long size, void* hostPtr = NULL) { + // return new TMemoryObject(this, size, hostPtr, false, false); + // } + virtual TKernel* prepareKernel(const std::string kernelSource, const std::string kernelName) = 0; + virtual std::vector prepareKernels(const std::string kernelSource, const std::vector kernelNames) = 0; + }; + + /** + * Class to allow storing pointers to BaseKernel without templates. 
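+ *
+ * Concrete kernels are usually created through BaseDevice::prepareKernel(). A minimal
+ * sketch, assuming `gpu`, `source`, `memObj` and `n` are set up elsewhere:
+ *   auto* kernel = gpu->prepareKernel(source, "my_kernel");  // compile from source
+ *   kernel->setParameter(memObj);                            // device buffer argument
+ *   unsigned long dims[3] = {n, 0, 0};
+ *   kernel->runAsync(dims);                                  // asynchronous launch over n items in X
+ *   kernel->waitAsync();                                     // block until completion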
+ */ + class BaseKernelBase { + public: + BaseKernelBase() {} + virtual ~BaseKernelBase() {} + + virtual void cloneInto(BaseKernelBase* other) { } + virtual Dimensions getNumBlocksAndThreads(Dimensions dims, const unsigned int maxThreadsPerBlock, size_t* maxThreadsDimension) { return dims; } + virtual Dimensions getNumBlocksAndThreadsFor(Dimensions dims) { return dims; } + }; + + /** + * Class that represent a single GPU kernel, which can be invoked multiple times. + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the specialized BaseMemoryObject class + * @param Type of the specialized BaseChunkedMemoryObject class + * @param Type of the (lib-specific) underlying async object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseKernel : + public BaseKernelBase, + virtual public BaseAsyncExecutionSupport { + protected: + std::string kernelName; + TDevice* device; + unsigned int parameterCount = 0; + unsigned int sharedMemoryBytes = 0; + Dimensions numThreadsPerBlock = {0, 0, 0}; + + BaseKernel(TDevice* device) : BaseKernel() { + this->device = device; + } + virtual Dimensions getNumBlocksAndThreads(Dimensions dims, const unsigned int maxThreadsPerBlock, size_t* maxThreadsDimension) override { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss.str(""); + #endif + // maxThreadsDimension is unsigned int[SUPPORTED_DIMS] + // Max is threads, min is blocks + Dimensions blocksAndThreads = { + {1, 1}, // X + {1, 1}, // Y + {1, 1} // Z + }; + + if (dims.y) { + if (dims.z) { + + // TODO support 3D kernels + throw GSParException("3-dimensional kernels not supported"); + + } else { + if ((dims.x.max * dims.y.max) > maxThreadsPerBlock) { + int maxThreads2D = sqrt(maxThreadsPerBlock); + maxThreadsDimension[0] = maxThreads2D; + maxThreadsDimension[1] = maxThreads2D; + } + } + } + + #ifdef GSPAR_DEBUG + if (this->numThreadsPerBlock) { + ss << "[GSPar Kernel " << this << "] Configured num of threads per block is " << this->numThreadsPerBlock.toString() << std::endl; + std::cout << ss.str(); + ss.str(""); + } + #endif + + for (int d = 0; d < SUPPORTED_DIMS; d++) { + if (dims[d]) { + if (numThreadsPerBlock[d] && numThreadsPerBlock[d].max < maxThreadsDimension[d]) { + maxThreadsDimension[d] = numThreadsPerBlock[d].max; + } + if (dims[d].delta() <= maxThreadsDimension[d]) { + blocksAndThreads[d].min = 1; // Blocks + blocksAndThreads[d].max = dims[d].delta(); // Threads + } else { + blocksAndThreads[d].min = ceil((double)dims[d].delta()/maxThreadsDimension[d]); // Blocks + blocksAndThreads[d].max = maxThreadsDimension[d]; // Threads + } + } + } + + return blocksAndThreads; + } + + public: + BaseKernel() { } + BaseKernel(TDevice* device, const std::string kernelSource, const std::string kernelName) : BaseKernel(device) { + this->kernelName = kernelName; + } + virtual ~BaseKernel() { } + virtual void cloneInto(BaseKernelBase* baseOther) override { + BaseKernelBase::cloneInto(baseOther); + BaseKernel* other = static_cast(baseOther); + other->kernelName = this->kernelName; + other->device = this->device; + other->parameterCount = this->parameterCount; + other->sharedMemoryBytes = this->sharedMemoryBytes; + } + virtual void setSharedMemoryAllocation(unsigned int sharedMemoryBytes) { + this->sharedMemoryBytes = sharedMemoryBytes; + } + virtual BaseKernel& setNumThreadsPerBlockForX(unsigned long num) { this->numThreadsPerBlock[0] = num; return *this; } 
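+ // These setNumThreadsPerBlock* setters cap the block size otherwise chosen by
+ // getNumBlocksAndThreads() above. Illustrative numbers: for a 1-D range of 10000 elements
+ // on a device with maxThreadsPerBlock = 1024, the default split is ceil(10000/1024) = 10
+ // blocks of 1024 threads (the returned SingleDimension holds blocks in "min" and threads
+ // in "max"); after setNumThreadsPerBlockForX(256) it becomes ceil(10000/256) = 40 blocks
+ // of 256 threads.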
+ virtual BaseKernel& setNumThreadsPerBlockForY(unsigned long num) { this->numThreadsPerBlock[1] = num; return *this; } + virtual BaseKernel& setNumThreadsPerBlockForZ(unsigned long num) { this->numThreadsPerBlock[2] = num; return *this; } + virtual BaseKernel& setNumThreadsPerBlockFor(int dim, unsigned long num) { + this->numThreadsPerBlock[dim] = num; + return *this; + } + virtual BaseKernel& setNumThreadsPerBlock(unsigned long numX, unsigned long numY, unsigned long numZ) { + this->numThreadsPerBlock[0] = numX; + this->numThreadsPerBlock[1] = numY; + this->numThreadsPerBlock[2] = numZ; + return *this; + } + // TODO setParameter should return the Kernel object itself to allow fluent programming, such as BaseParallelPattern + virtual int setParameter(TMemoryObject* memoryObject) = 0; + virtual int setParameter(TChunkedMemoryObject* chunkedMemoryObject) = 0; + virtual int setParameter(size_t parmSize, void* parm) = 0; + virtual int setParameter(size_t parmSize, const void* parm) = 0; + virtual void clearParameters() { + this->parameterCount = 0; + } + virtual void runAsync(unsigned long max[3], TExecutionFlow* executionFlow = NULL) { + this->runAsync(Dimensions(max), executionFlow); + } + virtual void runAsync(Dimensions max, TExecutionFlow* executionFlow = NULL) = 0; + }; + + /** + * Class that represent a single memory object. + * It is bound to a device. It holds a (optional) host and a device pointer. + * + * @param Type of the specialized BaseException class + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the (lib-specific) underlying error code + * @param Type of the (lib-specific) underlying async object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseMemoryObject : + virtual public BaseMemoryObjectBase, + virtual public BaseAsyncExecutionSupport { + protected: + // https://www.learncpp.com/cpp-tutorial/3-8a-bit-flags-and-bit-masks/ + static const unsigned char CAN_READ_FLAG = 1 << 0; + static const unsigned char CAN_WRITE_FLAG = 1 << 1; + + TDevice* device; + TLibMemoryObject devicePtr = NULL; + unsigned char flags = CAN_READ_FLAG | CAN_WRITE_FLAG; + bool _isPinnedHostMemory = false; + + /** + * @param readOnly identify that this memory object can only be read inside kernel + * @param writeOnly identify that this memory object can only be written inside kernel + */ + explicit BaseMemoryObject(bool readOnly, bool writeOnly) { + if (readOnly && writeOnly) { + throw TException("A memory object can't be read-only and write-only at the same time"); + } else if (readOnly) { + this->flags &= ~CAN_WRITE_FLAG; + } else if (writeOnly) { + this->flags &= ~CAN_READ_FLAG; + } + } + explicit BaseMemoryObject(TDevice* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly) : BaseMemoryObject(readOnly, writeOnly) { + this->device = device; + this->hostPtr = hostPtr; + this->size = size; + } + explicit BaseMemoryObject(TDevice* device, size_t size, const void* hostPtr) : + // const pointer must be read-only + BaseMemoryObject(device, size, const_cast(hostPtr), true, false) { } + + public: + BaseMemoryObject() {} + virtual ~BaseMemoryObject() {} + TLibMemoryObject getBaseMemoryObject() { return this->devicePtr; } + bool isReadOnly() { return !(this->flags & CAN_WRITE_FLAG); } + bool isWriteOnly() { return !(this->flags & CAN_READ_FLAG); } + void bindTo(void* hostPtr) { this->hostPtr = hostPtr; } + void bindTo(void* hostPtr, size_t size) { + this->bindTo(hostPtr); + 
this->size = size; + } + virtual void pinHostMemory() { this->setPinnedHostMemory(true); } + virtual void setPinnedHostMemory(bool pinned) { this->_isPinnedHostMemory = pinned; } + virtual bool isPinnedHostMemory() { return this->_isPinnedHostMemory; } + virtual void copyIn() = 0; + virtual void copyOut() = 0; + virtual void copyInAsync(TExecutionFlow* executionFlow = NULL) = 0; + virtual void copyOutAsync(TExecutionFlow* executionFlow = NULL) = 0; + }; + + /** + * Class that represent a chunked memory object. + * It is bound to a device. It holds a bunch of host pointers (the chunks) and a single device pointer. + * + * @param Type of the specialized BaseException class + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the (lib-specific) underlying error code + * @param Type of the (lib-specific) underlying async object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseChunkedMemoryObject : + virtual public BaseMemoryObjectBase, + virtual public BaseMemoryObject { + protected: + void** hostPointers = NULL; + unsigned int chunks = 0; + // We use the base property size for the chunkSize (size of each data chunk) + + // TODO shouldn't we call the base constructor? + explicit BaseChunkedMemoryObject(TDevice* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly) { + this->device = device; + this->hostPtr = NULL; + this->size = chunkSize; + this->hostPointers = hostPointers; + this->chunks = chunks; + } + explicit BaseChunkedMemoryObject(TDevice* device, unsigned int chunks, size_t chunkSize, const void** hostPointers) : + // const pointer must be read-only + BaseChunkedMemoryObject(device, chunks, chunkSize, const_cast(hostPointers), true, false) { } + + public: + BaseChunkedMemoryObject() : BaseMemoryObject() { } + virtual ~BaseChunkedMemoryObject() { } + size_t getChunkSize() { return this->size; } + unsigned int getChunkCount() { return this->chunks; } + }; + + /** + * Class that will end up being part of the stream elements + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the (lib-specific) underlying async object (the same used when inheriting BaseAsyncExecutionSupport) + * @param Type of the (lib-specific) underlying async execution flow object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseStreamElement : + virtual public BaseAsyncExecutionSupport, + virtual public BaseExecutionFlow { + public: + explicit BaseStreamElement(TDevice* device) { + // We should extend BaseExecutionFlow::constructor(device) + this->device = device; + } + virtual ~BaseStreamElement() {} + + }; + + /** + * Base class for kernel code generation + */ + class BaseKernelGenerator { + protected: + std::array defaultStdVarNames = {"x", "y", "z"}; + + public: + virtual const std::string getKernelPrefix() = 0; + virtual std::string generateStdFunctions() = 0; + virtual std::string replaceMacroKeywords(std::string kernelSource) = 0; + virtual std::string generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions dims) = 0; + virtual std::string generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) = 0; + virtual std::string generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) = 0; + virtual std::string generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions 
dims) = 0; + virtual std::string getStdVarNameForDimension(std::array& patternNames, int dimension) { + if (patternNames[dimension].empty()) { + return this->defaultStdVarNames[dimension]; + } + return patternNames[dimension]; + } + virtual std::array getStdVarNames(std::array& patternNames) { + return { + this->getStdVarNameForDimension(patternNames, 0), + this->getStdVarNameForDimension(patternNames, 1), + this->getStdVarNameForDimension(patternNames, 2) + }; + } + }; + + } +} + +#endif diff --git a/src/GSPar_BaseParallelPattern.hpp b/src/GSPar_BaseParallelPattern.hpp new file mode 100644 index 0000000..f7c7770 --- /dev/null +++ b/src/GSPar_BaseParallelPattern.hpp @@ -0,0 +1,1129 @@ + +#ifndef __GSPAR_BASEPARALLELPATTERN_INCLUDED__ +#define __GSPAR_BASEPARALLELPATTERN_INCLUDED__ + +#include +#include +#include +#include +#include //std::cout and std::cerr +#include +#include //std::generate_n +#ifdef GSPAR_DEBUG +#include +#include +#endif + +// Includes for getTypeName +#include +#include +#ifndef _MSC_VER +# include +#endif +#include +#include + +///// Forward declarations ///// + +namespace GSPar { + namespace Pattern { + + enum ParameterValueType { + GSPAR_PARAM_VALUE, + GSPAR_PARAM_POINTER + }; + + enum ParameterDirection { + GSPAR_PARAM_NONE, + GSPAR_PARAM_IN, + GSPAR_PARAM_OUT, + GSPAR_PARAM_INOUT, + GSPAR_PARAM_PRESENT // It avoids memory transfers when using a MemoryObject from user + }; + + struct VarType { + std::string name; + bool isPointer; //std::is_pointer + // Remember that struct are classes also + bool isClass; //std::is_class + bool isConst; //std::is_const + bool isVolatile; //std::is_volatile + bool isLValueRef; //std::is_lvalue_reference + bool isRValueRef; //std::is_rvalue_reference + + std::string getDeclarationName() { + return std::string("") + // Classes are not supported in OpenCL C99, so we assume the class is a struct + + (isClass ? "struct " : "") + + (isConst ? "const " : "") + + (isVolatile ? "volatile " : "") + + this->getFullName(); + } + + std::string getFullName() { + return std::string("") + + (isLValueRef ? "&" : "") + + (isRValueRef ? "&&" : "") + + name; + } + + std::string toString() { + return getFullName() + + (isPointer ? "*" : ""); + } + }; + + /** + * Base class for pattern parameters + */ + class BaseParameter { + protected: + bool complete = true; // Placeholder parameters are not complete + bool batched = false; // If the parameter is part of the batch + public: + std::string name; + VarType type; + size_t size; + ParameterValueType paramValueType; + ParameterDirection direction; + + BaseParameter() { } + BaseParameter(std::string name, VarType type, size_t size, ParameterValueType paramValueType, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) : + name(name), type(type), size(size), paramValueType(paramValueType), direction(direction), batched(batched) { + // std::cout << "Creating parameter " << type.name << " " << name << " of " << size << " bytes" << (batched ? " [batched]" : "") << std::endl; + }; + virtual ~BaseParameter() { } + + virtual std::string toString() { + return this->type.getFullName() + " " + name; + } + virtual std::string getNonPointerTypeName() { + auto type = this->type.getFullName(); + if (type.back() == '*') { // Should we check isPointer instead? 
+ type.pop_back(); + } + return type; + } + virtual bool isComplete() { + return this->complete; + } + virtual void setComplete(bool complete) { + this->complete = complete; + } + virtual bool isBatched() { + return this->batched; + } + virtual bool isConstant() { + return type.isConst; + } + virtual bool isIn() { + return this->direction == GSPAR_PARAM_IN || this->direction == GSPAR_PARAM_INOUT; + } + virtual bool isOut() { + return this->direction == GSPAR_PARAM_OUT || this->direction == GSPAR_PARAM_INOUT; + } + /** + * Returns the parameter type for use inside the kernel + */ + virtual std::string toKernelParameter() { + std::string type = this->type.getFullName(); + if (this->isBatched() && paramValueType == GSPAR_PARAM_VALUE) { + // A batched parameter is a pointer of values. + // If it's a PointerParameter, we already ripped off the extra * and will flatten the pointers. + // If it's a ValueParameter, we need to add an extra * (we will use a pointer of values) + type += "*"; + } + return type + " " + this->getKernelParameterName(); + } + virtual std::string getKernelParameterName() { + return (this->isBatched() ? "gspar_batched_" : "") + this->name; + } + virtual bool isValueTyped() = 0; + }; + + template + class TypedParameter; + + class ValueParameter; + + class PointerParameter; + + class BaseParallelPattern; + + } +} + +#include "GSPar_Base.hpp" +#include "GSPar_BaseGPUDriver.hpp" + +namespace GSPar { + namespace Pattern { + + // TODO this specialized classes are completely useless. We could work all out with only BaseParameter and it would be far simpler + + /** + * A pattern typed parameter + */ + template + class TypedParameter + : public BaseParameter { + protected: + T value; + std::unique_ptr memoryObject; + Driver::BaseMemoryObjectBase* userMemoryObject = nullptr; // MemoryObject from user + public: + size_t numberOfElements; + + TypedParameter() { } + TypedParameter(std::string name, VarType type, size_t size, T value, + ParameterValueType paramValueType, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) : + BaseParameter(name, type, size, paramValueType, direction, batched), value(value) { }; + virtual ~TypedParameter() { } + + virtual Driver::BaseMemoryObjectBase *getMemoryObject() { + if(userMemoryObject != nullptr){ + return this->userMemoryObject; + } + return this->memoryObject.get(); + } + + virtual void setUserMemoryObject(Driver::BaseMemoryObjectBase* memoryObjectFromUser) { + this->userMemoryObject = memoryObjectFromUser; + } + + // virtual T getValue() { return this->value; } + }; + + /** + * A value parameter for pattern + */ + class ValueParameter + : public TypedParameter { + public: + ValueParameter() : TypedParameter() { } + ValueParameter(std::string name, VarType type, size_t size, void *value, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) : + TypedParameter(name, type, size, value, ParameterValueType::GSPAR_PARAM_VALUE, direction, batched) { + if (value == nullptr) { // It may be just a placeholder + this->complete = false; + } + }; + virtual ~ValueParameter() { } + + virtual bool isValueTyped() override { return true; } + virtual void* getPointer() { return this->value; } + + template + Driver::BaseMemoryObjectBase *malloc(TDevice gpu, unsigned int batchSize) { + if (this->isBatched()) { + // By default, it is a read-only parameter + this->memoryObject = std::unique_ptr(gpu->malloc(batchSize * this->size, this->getPointer(), true, false)); + } + // If it is a non-batched ValueParameter, we return a nullptr + 
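+ // (in that case this->memoryObject stays empty and get() below yields nullptr;
+ // non-batched values are instead passed by value via Kernel::setParameter(size, pointer)
+ // in BaseParallelPattern::setParameterInKernel)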
return this->memoryObject.get(); + } + }; + + /** + * A pointer parameter for pattern + */ + class PointerParameter + : public TypedParameter { + public: + PointerParameter() : TypedParameter() { } + // Constructor with no MemoryObject from user + PointerParameter(std::string name, VarType type, size_t size, void *value, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) : + TypedParameter(name, type, size, value, ParameterValueType::GSPAR_PARAM_POINTER, direction, batched) { + if (!value) { // It is just a placeholder + this->complete = false; + } + }; + // Constructor with MemoryObject from user + PointerParameter(std::string name, VarType type, Driver::BaseMemoryObjectBase* userMemoryObject, ParameterDirection direction = GSPAR_PARAM_IN, bool batched =false) : + TypedParameter(name, type, userMemoryObject->getSize(), userMemoryObject->getHostPointer(), ParameterValueType::GSPAR_PARAM_POINTER, direction, batched) { + this->setUserMemoryObject(userMemoryObject); + }; + virtual ~PointerParameter() { } + + virtual bool isValueTyped() override { return false; } + virtual void* getPointer() { return this->value; } + + template + Driver::BaseMemoryObjectBase *malloc(TDevice gpu, unsigned int batchSize) { + // If it is only IN, the kernel won't write, if is OUT, the kernel won't read + bool readOnly = (this->direction == Pattern::ParameterDirection::GSPAR_PARAM_IN); + bool writeOnly = (this->direction == Pattern::ParameterDirection::GSPAR_PARAM_OUT); + if (this->isBatched()) { + // A batched PointerParameter is conversible to void** + this->memoryObject = std::unique_ptr(gpu->mallocChunked(batchSize, this->size, (void**)this->getPointer(), readOnly, writeOnly)); + } else { + this->memoryObject = std::unique_ptr(gpu->malloc(this->size, this->getPointer(), readOnly, writeOnly)); + } + return this->memoryObject.get(); + } + }; + + /** + * Base class for parallel patterns + */ + class BaseParallelPattern { + private: + unsigned int gpuIndex = 0; + Driver::BaseDeviceBase* gpuDevice = nullptr; + + protected: + std::unique_ptr executionFlow; + bool batched = false; + unsigned int batchSize = 1; //TODO what if Dimension max is not divisible by batchSize? It actually segfaults + bool _isKernelCompiled = false; + bool isKernelStale = false; // Do we need to recompile the kernel? + mutable std::mutex compiledKernelMutex; + // Should we use a std::map to support multiple pre-compiled kernels? + Driver::Dimensions compiledKernelDimension; + std::shared_ptr compiledKernel; + std::string kernelName; + std::string userKernel; + std::string extraKernelCode; + std::vector paramsOrder; + // Set the thread block size (it is an optional paramenter) #gabriell + int numThreadsPerBlock[3] = {0, 0, 0}; + /** + * We use a shared_ptr of parameters, so they can be safely cloned together with the Pattern + * And they'll be automatically released as soon as all clones are destroyed + */ + std::map> params; + std::array stdVarNames; + bool useSharedMemory = false; + mutable std::mutex sharedMemoryParameterMutex; + PointerParameter* sharedMemoryParameter = nullptr; + + // Parameters + + /** + * Get the type (as string) of the template argument + * from https://stackoverflow.com/a/20170989/ + */ + template + VarType getTemplatedType() { + typedef typename std::remove_reference::type TR; + std::unique_ptr own ( + #ifndef _MSC_VER + abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr), + #else + nullptr, + #endif + std::free + ); + VarType varType; + varType.name = own != nullptr ? 
own.get() : typeid(TR).name(); + varType.isPointer = std::is_pointer::value; + if (varType.isPointer) { + typedef typename std::remove_pointer::type TNoPtr; + varType.isClass = std::is_class::value; + } else { + varType.isClass = std::is_class::value; + } + varType.isConst = std::is_const::value; + varType.isVolatile = std::is_volatile::value; + varType.isLValueRef = std::is_lvalue_reference::value; + if (!varType.isLValueRef) { // Can't be both + varType.isRValueRef = std::is_rvalue_reference::value; + } + return varType; + } + + virtual void setPointerParameter(std::string name, VarType type, size_t size, void *value, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) { + // GSPAR_PARAM_PRESENT is incorrect when using a host pointer instead of a MemoryObject + if (direction == GSPAR_PARAM_PRESENT) { + throw GSParException("Pattern parameter \"" + name + "\": GSPAR_PARAM_PRESENT is only allowed when a MemoryObject is provided"); + } + std::shared_ptr parameter(new PointerParameter(name, type, size, value, direction, batched)); + this->setParameter(parameter); + } + // Using MemoryObject from user + virtual void setPointerParameter(std::string name, VarType type, Driver::BaseMemoryObjectBase* userMemoryObject, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) { + // new PointParameter with MemoryObject from user + std::shared_ptr parameter(new PointerParameter(name, type, userMemoryObject, direction, batched)); + this->setParameter(parameter); + } + virtual void setValueParameter(std::string name, VarType type, size_t size, void *value, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) { + std::shared_ptr parameter(new ValueParameter(name, type, size, value, direction, batched)); + this->setParameter(parameter); + } + virtual void setParameter(std::shared_ptr parameter) { + // std::cout << "Setting BaseParameter " << parameter->type.getFullName() << " " << parameter->name << " of " << parameter->size << " bytes" << (parameter->isBatched() ? " [batched]" : "") << std::endl; + auto paramName = parameter.get()->name; + if (std::find(this->paramsOrder.begin(), this->paramsOrder.end(), paramName) == this->paramsOrder.end()) { + this->paramsOrder.push_back(paramName); + this->isKernelStale = true; // There is a new parameter, we need to recompile the kernel + } + this->params[paramName] = parameter; + } + + template + decltype(TDriverInstance::getExecutionFlowType())* getExecutionFlow() { + return dynamic_cast(this->executionFlow.get()); + } + + // Main run function for Parallel Pattern + template + void run(Driver::Dimensions pDims, bool useCompiledDim) { + Driver::Dimensions dimsToUse = useCompiledDim ? 
this->compiledKernelDimension : pDims; + if (!dimsToUse.getCount()) { + throw GSParException("No dimensions set to run the pattern"); + } + #ifdef GSPAR_DEBUG + std::stringstream ss; + #endif + + // TODO validade if dimsToUse is valid + + Driver::Dimensions dimsToRun = dimsToUse; + if (this->isBatched()) { + dimsToRun *= this->batchSize; + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<batchSize << " batch size, so we'll run for " << dimsToRun.toString() << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + } + + this->compile(dimsToUse); + + // #ifdef GSPAR_DEBUG + // auto gpu = this->getGpu(); + // ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<getName() << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + auto kernel = this->getCompiledKernel(); + kernel->clearParameters(); + + // Set the thread block size (it is an optional paramenter) + if (numThreadsPerBlock[0] != 0) { + kernel->setNumThreadsPerBlockForX(numThreadsPerBlock[0]); + } + if (numThreadsPerBlock[1] != 0) { + kernel->setNumThreadsPerBlockForY(numThreadsPerBlock[1]); + } + if (numThreadsPerBlock[2] != 0) { + kernel->setNumThreadsPerBlockForZ(numThreadsPerBlock[2]); + } + + this->callbackBeforeAllocatingMemoryOnGpu(dimsToUse, kernel); + + this->mallocParametersInGpu(); + + this->copyParametersFromHostToGpuAsync(); + + this->setSharedMemoryInKernel(kernel, dimsToUse); + + this->setParametersInKernel(kernel, dimsToUse); + + this->callbackAfterCopyDataFromHostToGpu(); + this->callbackBeforeRunInGpu(); + + auto executionFlow = this->getExecutionFlow(); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<runAsync(dimsToRun, executionFlow); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<waitAsync(); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<callbackAfterRunInGpu(); + + this->copyParametersFromGpuToHostAsync(); + + this->callbackAfterCopyDataFromGpuToHost(dimsToUse, kernel); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<batched; + } + + virtual BaseParallelPattern& setBatchSize(unsigned int batchSize) { + if (!batchSize) { // Set not batched + if (this->isBatched()) { + // The pattern was batched and now it isn't, we need to recompile the kernel + this->isKernelStale = true; + } + this->batched = false; + } else { + if (!this->isBatched()) { + // The pattern wasn't batched and now it is, we need to recompile the kernel + this->isKernelStale = true; + } + this->batched = true; + } + this->batchSize = batchSize; + return *this; + } + + // TODO support using GPUs based on some scheduler (round-robin, etc) + virtual void setGpuIndex(unsigned int index) { + if (this->gpuIndex != index) { + this->isKernelStale = true; // If the GPU changed, we need to recompile the kernel + this->gpuDevice = nullptr; + this->executionFlow.reset(); + this->gpuIndex = index; + } + } + virtual unsigned int getGpuIndex() { + return this->gpuIndex; + } + + template + void cloneInto(BaseParallelPattern* other) const { + this->cloneIntoNonTemplated(other); + // Clone templated values + other->setGpu((decltype(TDriverInstance::getDeviceType())*)this->gpuDevice); + // executionFlow is not copied, each instance uses it's own. 
setGpu call initializes it also + + if (this->_isKernelCompiled && !this->isKernelStale) { // We only copy the kernel if it's a valid (and usable) one + std::lock_guard lock(other->compiledKernelMutex); // Auto-unlock, RAII + other->_isKernelCompiled = this->_isKernelCompiled; + other->isKernelStale = this->isKernelStale; + // compiledKernelMutex is (quite obviously) unique for each instance + if (this->compiledKernelDimension.getCount()) { + Driver::Dimensions compiledKernelDimension = this->compiledKernelDimension; + other->compiledKernelDimension = compiledKernelDimension; + } + if (this->compiledKernel.get()) { + other->compiledKernel = std::shared_ptr(new decltype(TDriverInstance::getKernelType())()); + auto localKernel = this->getCompiledKernel(); + localKernel->cloneInto(other->compiledKernel.get()); + } + } + } + + void cloneIntoNonTemplated(BaseParallelPattern* other) const { + // Clone + other->gpuIndex = this->gpuIndex; + other->batched = this->batched; + other->batchSize = this->batchSize; + other->kernelName = this->kernelName; + other->userKernel = this->userKernel; + other->extraKernelCode = this->extraKernelCode; + other->paramsOrder = this->paramsOrder; + other->params = this->params; + other->stdVarNames = this->stdVarNames; + other->useSharedMemory = this->useSharedMemory; + other->sharedMemoryParameter = this->sharedMemoryParameter; + } + + template + BaseParallelPattern& setCompiledKernel(decltype(TDriverInstance::getKernelType())* kernel, Driver::Dimensions dims) { + std::lock_guard lock(this->compiledKernelMutex); // Auto-unlock, RAII + this->compiledKernel = std::shared_ptr(kernel); + this->compiledKernelDimension = dims; + this->_isKernelCompiled = true; + this->isKernelStale = false; + return *this; + // Auto-unlock of compiledKernelMutex, RAII + } + + template + decltype(TDriverInstance::getKernelType())* getCompiledKernel() const { + return static_cast(this->compiledKernel.get()); + } + + template + void setGpu(decltype(TDriverInstance::getDeviceType())* device) { + if (this->gpuDevice != device) { + this->gpuDevice = device; + auto executionFlow = new decltype(TDriverInstance::getExecutionFlowType())(device); + executionFlow->start(); + this->executionFlow = std::unique_ptr(executionFlow); + } + } + template + decltype(TDriverInstance::getDeviceType())* getGpu() { + // Driver::BaseDeviceBase* getGpu() { + if (this->gpuDevice == nullptr) { + TDriverInstance* driver = TDriverInstance::getInstance(); + // Driver::CUDA::Instance driver = TDriverInstance::getInstance(); //Provides autocomplete + driver->init(); + + if (driver->getGpuCount() == 0) { + return nullptr; + } + + auto gpu = driver->getGpu(this->gpuIndex); + this->setGpu(gpu); + } + return (decltype(TDriverInstance::getDeviceType())*)this->gpuDevice; + } + + virtual BaseParallelPattern& addExtraKernelCode(std::string extraKernelCode) { + this->extraKernelCode += extraKernelCode; + this->isKernelStale = true; // The kernel code changed, we need to recompile it + return *this; + } + + virtual std::pair generateDefaultControlIf(Driver::Dimensions dims, std::array stdVarNames) { + std::string r = "if ("; + for(int d = 0; d < SUPPORTED_DIMS; d++) { + if (dims[d]) { + if (this->isBatched()) { + r += "(gspar_batch_" + stdVarNames[d] + " < gspar_batch_size)&&"; + } + r += "(" + stdVarNames[d] + " < gspar_max_" + stdVarNames[d] + ")&&"; + } + } + // Removes last && + r.pop_back(); + r.pop_back(); + r += ") {\n"; + return std::make_pair(r, "}"); + } + + template + std::string generateKernelSource(Driver::Dimensions 
dims) { + + auto codeGenerator = TDriverInstance::getInstance()->getKernelGenerator(); + std::string kernelName = this->getKernelName(); + + std::pair ifDimensions = this->generateDefaultControlIf(dims, codeGenerator->getStdVarNames(this->stdVarNames)); + + return (!this->extraKernelCode.empty() ? this->extraKernelCode + "\n" : "") + + codeGenerator->getKernelPrefix() + " " + kernelName + "(" + + codeGenerator->generateParams(this, dims) + ") {\n" + + codeGenerator->generateInitKernel(this, dims) + "\n" + + codeGenerator->generateStdVariables(this, dims) + + codeGenerator->generateBatchedParametersInitialization(this, dims) + "\n" + + ifDimensions.first + + this->getKernelCore(dims, codeGenerator->getStdVarNames(this->stdVarNames)) + + "\n" + ifDimensions.second + "\n" // if (dims) + + "}\n"; // kernel + } + + virtual std::string getKernelName() { + if (this->kernelName.empty()) { + this->kernelName = "gspar_kernel_" + getRandomString(7); + } + return this->kernelName; + } + + virtual void setKernelName(std::string kernelName) { + this->kernelName = kernelName; + } + + std::array& getStdVarNames() { + return this->stdVarNames; + } + BaseParallelPattern& setStdVarNames(std::array names) { + this->stdVarNames = names; + this->isKernelStale = true; // The kernel code changed, we need to recompile it + // TODO should we check if the names really changed? + return *this; + } + + virtual std::string getKernelCore(Driver::Dimensions dims, std::array stdVarNames) { + return std::string(this->getUserKernel()); + } + std::string getUserKernel() { + return userKernel; + } + + bool isUsingSharedMemory() { + return this->useSharedMemory; + } + virtual PointerParameter* generateSharedMemoryParameter(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { + return this->getSharedMemoryParameter(); + } + virtual PointerParameter* getSharedMemoryParameter() { + return this->sharedMemoryParameter; + } + BaseParameter* getParameter(std::string name) { + auto it = this->params.find(name); + if (it == this->params.end()) { + return nullptr; + } + return it->second.get(); + } + virtual std::vector getParameterList() { + std::vector paramList; + for (auto ¶mName : this->paramsOrder) { + paramList.push_back(this->getParameter(paramName)); + } + return paramList; + } + // Set the thread block size (it is an optional paramenter) #gabriell + virtual BaseParallelPattern& setNumThreadsPerBlockForX(unsigned long num) { + return this->setNumThreadsPerBlockFor(0, num); + } + virtual BaseParallelPattern& setNumThreadsPerBlockForY(unsigned long num) { + return this->setNumThreadsPerBlockFor(1, num); + } + virtual BaseParallelPattern& setNumThreadsPerBlockForZ(unsigned long num) { + return this->setNumThreadsPerBlockFor(2, num); + } + virtual BaseParallelPattern& setNumThreadsPerBlockFor(int dim, unsigned long num) { + this->numThreadsPerBlock[dim] = num; + return *this; + } + virtual BaseParallelPattern& setNumThreadsPerBlock(unsigned long numX, unsigned long numY, unsigned long numZ) { + this->numThreadsPerBlock[0] = numX; + this->numThreadsPerBlock[1] = numY; + this->numThreadsPerBlock[2] = numZ; + return *this; + } + + /** + * Parameter placeholder + */ + template + BaseParallelPattern& setParameterPlaceholder(std::string name, ParameterValueType parameterType = GSPAR_PARAM_POINTER, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) { + VarType varType = getTemplatedType(); + if (parameterType == ParameterValueType::GSPAR_PARAM_POINTER) { + this->setPointerParameter(name, varType, 0, nullptr, 
direction, batched); + } else if (parameterType == ParameterValueType::GSPAR_PARAM_VALUE) { + this->setValueParameter(name, varType, sizeof(T), nullptr, direction, batched); + } + if (batched) { + this->batched = true; + } + return *this; + } + + /** + * Pointer parameters + */ + template + BaseParallelPattern& setParameter(std::string name, size_t size, T* value, ParameterDirection direction = GSPAR_PARAM_IN) { + VarType varType = getTemplatedType(); + this->setPointerParameter(name, varType, size, value, direction); + return *this; + } + template + BaseParallelPattern& setParameter(std::string name, size_t size, const T* value) { + // Can't call setParameter(non-const T) because getTemplatedType would lost const information + VarType varType = getTemplatedType(); + // A const parameter must be IN, as it can't be modified + this->setPointerParameter(name, varType, size, const_cast(value), GSPAR_PARAM_IN); + return *this; + } + // Using MemoryObject from user + template + BaseParallelPattern& setParameter(std::string name, Driver::BaseMemoryObjectBase* userMemoryObject, ParameterDirection direction = GSPAR_PARAM_IN) { + VarType varType = getTemplatedType(); + this->setPointerParameter(name, varType, userMemoryObject, direction); + return *this; + } + + /** + * Value parameters + */ + template + BaseParallelPattern& setParameter(std::string name, T value) { + VarType varType = getTemplatedType(); + // We need a pointer, so we allocate memory and copy the value + T* value_copy = new T; + *value_copy = value; + // A value parameter must be IN, as it can't be modified + this->setValueParameter(name, varType, sizeof(T), value_copy, GSPAR_PARAM_IN); + return *this; + } + + /** + * Batched (pointer and value) parameters + */ + template + BaseParallelPattern& setBatchedParameter(std::string name, size_t sizeOfEachBatch, T** value, ParameterDirection direction = GSPAR_PARAM_IN) { + this->batched = true; + VarType varType = getTemplatedType(); + varType.name.pop_back(); // We receive ** due to the batch. So the kernel type is only * (we flatten the pointers) + this->setPointerParameter(name, varType, sizeOfEachBatch, value, direction, true); + return *this; + } + template + BaseParallelPattern& setBatchedParameter(std::string name, size_t sizeOfEachBatch, const T** value) { + // Can't call setBatchedParameter(non-const T) because getTypeName would lost const information + this->batched = true; + VarType varType = getTemplatedType(); + varType.name.pop_back(); // We receive ** due to the batch. So the kernel type is only * (we flatten the pointers) + // A const parameter must be IN, as it can't be modified + this->setPointerParameter(name, varType, sizeOfEachBatch, const_cast(value), GSPAR_PARAM_IN, true); + return *this; + } + template + BaseParallelPattern& setBatchedParameter(std::string name, const T* value) { + this->batched = true; + VarType varType = getTemplatedType(); + varType.name.pop_back(); // We receive * due to the batch. + // The effective kernel type is a pure value, but for the parameters we still need it to be a pointer (check BaseParameter::toKernelParameter). + this->setValueParameter(name, varType, sizeof(T), const_cast(value), GSPAR_PARAM_IN, true); + return *this; + } + + virtual bool isKernelCompiledFor(Driver::Dimensions dims) { + // We only compile if the kernel wasn't compiled yet and the configuration didn't change + return this->_isKernelCompiled && !this->isKernelStale && + // TODO #10 Do we really need the exact same dimension? The sizes are passed in parameters. 
+ this->compiledKernelDimension == dims; + } + + /** + * Compiles the pattern (including the generation and compilation of the GPU kernel) for the dims Dimensions. + * + * @param Type of the specialized BaseInstance class + * @param dims The Dimensions for which the pattern should be compiled + */ + template + BaseParallelPattern& compile(Driver::Dimensions dims) { + // We only compile if the kernel wasn't compiled yet and the configuration didn't change + if (this->isKernelCompiledFor(dims)) { + return *this; + } + std::lock_guard lock(this->compiledKernelMutex); // Auto-unlock, RAII + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[" << std::this_thread::get_id() << " GSPar "<getGpu(); + if (gpu == nullptr) { + throw GSParException("No GPU found for Pattern compilation"); + } + + std::string kernelName = this->getKernelName(); + + this->callbackBeforeGeneratingKernelSource(); + + std::string kernelSource = this->generateKernelSource(dims); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar "<compiledKernel = std::unique_ptr{ + // (void*)(gpu->prepareKernel(kernelSource.c_str(), kernelName.c_str())), + // [](void *ptr) { delete static_cast(ptr); } + // }; + auto kernel = gpu->prepareKernel(kernelSource.c_str(), kernelName.c_str()); + this->compiledKernel = std::shared_ptr(kernel); + this->compiledKernelDimension = dims; + this->_isKernelCompiled = true; + this->isKernelStale = false; + return *this; + // Auto-unlock of compiledKernelMutex, RAII + } + + // TODO most of the following functions should have protected visibility + + /** + * Set shared memory allocation in kernel object + * + * @param Type of the specialized BaseInstance class + * @param kernel The kernel on which the shared memory will be configured + * @param dims The Dimensions for which the shared memory will be configured + */ + template + void setSharedMemoryInKernel(decltype(TDriverInstance::getKernelType())* kernel, Driver::Dimensions dims) { + if (!this->isUsingSharedMemory()) { + return; + } + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<generateSharedMemoryParameter(dims, kernel); + kernel->setSharedMemoryAllocation(shmemParam->size); + } + + /** + * Allocates memory in GPU device for this pattern's parameters + * + * @param Type of the specialized BaseInstance class + */ + template + void mallocParametersInGpu() { + auto device = this->getGpu(); + if (device == nullptr) { + throw GSParException("No GPU found to allocate memory for parameters for Pattern"); + } + for (auto ¶mName : this->paramsOrder) { + auto param = this->getParameter(paramName); + if (!param || !param->isComplete()) { + throw GSParException("Pattern parameter \"" + param->name + "\" is just a placeholder. The parameter list must be complete to run the parallel pattern."); + } + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { // It is a PointerParameter + auto paramPointer = static_cast(param); + if (paramPointer->getMemoryObject() == nullptr) { // It returns a MemoryObject from user, if available + paramPointer->malloc(device, this->batchSize); //TODO check if the batchSize changed since the last parameter allocation + #ifndef GSPAR_PATTERN_DISABLE_PINNED_MEMORY + // In some cases, copyInAsync fails with CUDA_ERROR_INVALID_VALUE: invalid argument. According to the docs: + // Memory regions requested must be either entirely registered with CUDA, or in the case of host pageable transfers, not registered at all. 
+ // Memory regions spanning over allocations that are both registered and not registered with CUDA are not supported and will return CUDA_ERROR_INVALID_VALUE. + // We confirmed that avoiding pinned memory eliminates the failure, but we are still unsure why it happens + if (paramPointer->direction == GSPAR_PARAM_INOUT || paramPointer->direction == GSPAR_PARAM_OUT) { + // Pinned memory allows for memory operations overlapping in CUDA + if (paramPointer->isBatched()) { + auto chunkedMemObj = dynamic_cast(paramPointer->getMemoryObject()); + chunkedMemObj->pinHostMemory(); + } else { + auto singleMemObj = dynamic_cast(paramPointer->getMemoryObject()); + singleMemObj->pinHostMemory(); + } + } + #endif + } + } else if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { + auto paramValue = static_cast(param); + if (paramValue->getMemoryObject() == nullptr) { + paramValue->malloc(device, this->batchSize); + } + } + } + } + + /** + * Copies IN and INOUT parameters from host to device (asynchronously) + * + * @param Type of the specialized BaseInstance class + */ + template + void copyParametersFromHostToGpuAsync() { + #ifdef GSPAR_DEBUG + std::stringstream ss; ss.str(""); + #endif + // We use the same execution flow as the kernel itself, so we don't need to wait the async copies to finish + // Waiting the async copies to finish causes OpenCL to hang (possibly a deadlock?) + auto executionFlow = this->getExecutionFlow(); + + for (auto ¶mName : this->paramsOrder) { + auto param = this->getParameter(paramName); + if (param && param->isIn()) { + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + auto paramPointer = static_cast(param); + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<name << " (" << paramPointer->getMemoryObject() << ") to GPU in flow " << executionFlow << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + if (param->isBatched()) { + auto chunkedMemObj = dynamic_cast(paramPointer->getMemoryObject()); + if (this->batchSize != chunkedMemObj->getChunkCount()) { + // The pattern batch size changed from when the parameter was created. + // If it is lower than the parameter batch size, we copy only the related chunks + // TODO what if it is higher? 
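+ // (presumably a batch size higher than getChunkCount() would make copyInAsync(c, ...)
+ // index past the allocated chunks, so callers are expected to keep batchSize within
+ // the chunk count the parameter was created with)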
+ for (unsigned int c = 0; c < this->batchSize; c++) { + chunkedMemObj->copyInAsync(c, executionFlow); + } + } else { + chunkedMemObj->copyInAsync(executionFlow); // Copy all the chunks + } + } else { + auto singleMemObj = dynamic_cast(paramPointer->getMemoryObject()); + singleMemObj->copyInAsync(executionFlow); + } + } else if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { + if (param->isBatched()) { + auto paramValue = static_cast(param); + auto memObj = dynamic_cast(paramValue->getMemoryObject()); + memObj->copyInAsync(executionFlow); + } + } + } + } + } + + template + void copyParametersFromGpuToHostAsync() { + // #ifdef GSPAR_DEBUG + // std::stringstream ss; + // #endif + for (auto& paramName : this->paramsOrder) { + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Pattern "<getParameter(paramName); + if (param && param->isOut() && param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + auto paramPointer = static_cast(param); + // TODO copy async + // memObj->copyOutAsync(); + // std::cout << "Asking to copy " << param->name << " back from GPU" << std::endl; + if (param->isBatched()) { + auto chunkedMemObj = dynamic_cast(paramPointer->getMemoryObject()); + if (this->batchSize != chunkedMemObj->getChunkCount()) { + // The pattern batch size changed from when the parameter was created. + // If it is lower than the parameter batch size, we copy only the related chunks + // TODO what if it is higher? + for (unsigned int c = 0; c < this->batchSize; c++) { + chunkedMemObj->copyOut(c); + } + } else { + chunkedMemObj->copyOut(); // Copy all the chunks + } + } else { + auto singleMemObj = dynamic_cast(paramPointer->getMemoryObject()); + if (singleMemObj) { + singleMemObj->copyOut(); + } + } + } + } + } + + template + void setParametersInKernel(decltype(TDriverInstance::getKernelType())* kernel, Driver::Dimensions dims) { + this->setDimsParametersInKernel(kernel, dims); + + if (this->isBatched()) { + kernel->setParameter(sizeof(unsigned int), &this->batchSize); + } + + // Sets Pattern parameters in Kernel object + for (auto ¶mName : this->paramsOrder) { + auto param = this->getParameter(paramName); + this->setParameterInKernel(kernel, param); + } + } + + template + void setDimsParametersInKernel(decltype(TDriverInstance::getKernelType())* kernel, Driver::Dimensions dims) { + for(int d = 0; d < dims.getCount(); d++) { + if (dims.is(d)) { + // #ifdef GSPAR_DEBUG + // std::stringstream ss; ss.str(""); + // ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<setParameter(sizeof(unsigned long), &(dims[d].max)); + if (dims[d].min && !this->isBatched()) { // Same check as codeGenerator + // TODO Support min in batches + // #ifdef GSPAR_DEBUG + // ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<setParameter(sizeof(unsigned long), &(dims[d].min)); + } + } + } + } + + template + void setParameterInKernel(decltype(TDriverInstance::getKernelType())* kernel, BaseParameter* parameter) { + if (parameter->direction == Pattern::ParameterDirection::GSPAR_PARAM_NONE) { + return; // NONE parameters doesn't go in kernel + } + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<name << "' in kernel " << kernel << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + if (parameter->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { // It is a PointerParameter + auto paramPointer = static_cast(parameter); + if (parameter->isBatched()) { + auto chunkedMemObj = 
dynamic_cast(paramPointer->getMemoryObject()); + // We don't need to wait the async copy because they are running in the same execution flow as the kernel itself + // if (chunkedMemObj) { + // chunkedMemObj->waitAsync(); // Waits for async copy to finish + // } + kernel->setParameter(chunkedMemObj); // We can simply set the memory object + } else { + auto singleMemObj = dynamic_cast(paramPointer->getMemoryObject()); + // We don't need to wait the async copy because they are running in the same execution flow as the kernel itself + // if (singleMemObj) { + // singleMemObj->waitAsync(); // Waits for async copy to finish + // } + kernel->setParameter(singleMemObj); // We can simply set the memory object + } + } else if (parameter->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { // It is a ValueParameter + auto paramValue = static_cast(parameter); + if (parameter->isBatched()) { + // Batched ValueParameters are allocated as a single buffer + auto singleMemObj = dynamic_cast(paramValue->getMemoryObject()); + // We don't need to wait the async copy because they are running in the same execution flow as the kernel itself + // if (singleMemObj) { + // singleMemObj->waitAsync(); // Waits for async copy to finish + // } + kernel->setParameter(singleMemObj); // We can simply set the memory object + } else { + // We get the pointer directly + auto paramValue = static_cast(parameter); + kernel->setParameter(paramValue->size, paramValue->getPointer()); + } + } + + } + + template + void run() { + this->run(Driver::Dimensions(), true); + } + + template + void run(unsigned long dims[3][2]) { + this->run(Driver::Dimensions(dims), false); + } + + template + void run(unsigned long max[3]) { + this->run(Driver::Dimensions(max), false); + } + + template + void run(Driver::Dimensions dims) { + this->run(dims, false); + } + + // Overridable callbacks + // TODO these callbacks should have protected visibility + virtual void callbackBeforeGeneratingKernelSource() { } + virtual void callbackBeforeAllocatingMemoryOnGpu(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { } + virtual void callbackAfterCopyDataFromHostToGpu() { } + virtual void callbackBeforeRunInGpu() { } + virtual void callbackAfterRunInGpu() { } + virtual void callbackAfterCopyDataFromGpuToHost(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { } + }; + + } +} + +#endif diff --git a/src/GSPar_CUDA.cpp b/src/GSPar_CUDA.cpp new file mode 100644 index 0000000..92048c4 --- /dev/null +++ b/src/GSPar_CUDA.cpp @@ -0,0 +1,942 @@ + +#include +#include +#include +#include +#ifdef GSPAR_DEBUG +#include +#endif +#include +#include + +#include "GSPar_CUDA.hpp" + +using namespace GSPar::Driver::CUDA; + + +///// Exception ///// + +std::string Exception::getErrorString(CUresult code) { + const char* errName; + cuGetErrorName(code, &errName); + const char* errString; + cuGetErrorString(code, &errString); + std::string res(errName); + res.append(": "); + res.append(errString); + return res; +} +Exception::Exception(std::string msg, std::string details) : BaseException(msg, details) { } +Exception::Exception(CUresult code, std::string details) : BaseException(code, details) { + // Can't call this virtual function in the base constructor + this->msg = this->getErrorString(code); +} +Exception* Exception::checkError(CUresult code, std::string details) { + return BaseException::checkError(code, CUDA_SUCCESS, details); +} +void Exception::throwIfFailed(CUresult code, std::string details) { + // Exception* ex = Exception::checkError(code, 
details); + // if (ex) std::cerr << "Exception: " << ex->what() << " - " << ex->getDetails() << std::endl; + BaseException::throwIfFailed(code, CUDA_SUCCESS, details); +} + +std::string CompilationException::getErrorString(nvrtcResult code) { + const char* errString = nvrtcGetErrorString(code); + return std::string(errString); +} +CompilationException::CompilationException(std::string msg, std::string details) : BaseException(msg, details) { } +CompilationException::CompilationException(nvrtcResult code, std::string details) : BaseException(code, details) { + // Can't call this virtual function in the base constructor + this->msg = this->getErrorString(code); +} +CompilationException* CompilationException::checkError(nvrtcResult code, std::string details) { + return BaseException::checkError(code, NVRTC_SUCCESS, details); +} +void CompilationException::throwIfFailed(nvrtcResult code, std::string details) { + BaseException::throwIfFailed(code, NVRTC_SUCCESS, details); +} +void CompilationException::throwIfFailed(nvrtcResult code, nvrtcProgram cudaProgram, std::string details) { + if (code == NVRTC_ERROR_COMPILATION) { + size_t logSize; + nvrtcGetProgramLogSize(cudaProgram, &logSize); + char* log = new char[logSize]; + nvrtcGetProgramLog(cudaProgram, log); + details += "\n" + std::string(log); + } + BaseException::throwIfFailed(code, NVRTC_SUCCESS, details); +} + + +///// ExecutionFlow ///// + +ExecutionFlow::ExecutionFlow() : BaseExecutionFlow() { } +ExecutionFlow::ExecutionFlow(Device* device) : BaseExecutionFlow(device) { } +ExecutionFlow::~ExecutionFlow() { + // We don't throw exceptions on destructors + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + #endif + if (this->flowObject) { + // In case the device is still doing work in the stream when cuStreamDestroy() is called, + // the function will return immediately and the resources associated with the stream will + // be released automatically once the device has completed all work in the stream. 
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html + #ifdef GSPAR_DEBUG + ss << "[GSPar Execution Flow " << this << "] clearing CUstream" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( cuStreamDestroy(this->flowObject) ); + if (ex != nullptr) { + std::cerr << "Failed when releasing cuda stream of execution flow: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + + this->flowObject = NULL; + } +} +CUstream ExecutionFlow::start() { + // #ifdef GSPAR_DEBUG + // std::stringstream ss; // Using stringstream eases multi-threaded debugging + // ss << "[GSPar CUDA "<device << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + if (!this->device) { + // Can't start flow on a NULL device + throw Exception("A device is required to start an execution flow", defaultExceptionDetails()); + } + if (!this->flowObject) { + this->device->getContext(); // There must be a context to create a stream + CUstream stream; + throwExceptionIfFailed( cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING) ); + this->setBaseFlowObject(stream); + } + return this->getBaseFlowObject(); +} +void ExecutionFlow::synchronize() { + throwExceptionIfFailed( cuStreamSynchronize(this->getBaseFlowObject()) ); +} +CUstream ExecutionFlow::checkAndStartFlow(Device* device, ExecutionFlow* executionFlow) { + return BaseExecutionFlow::checkAndStartFlow(device, executionFlow); +} + + +///// AsyncExecutionSupport ///// + +AsyncExecutionSupport::AsyncExecutionSupport(CUstream asyncObj) : BaseAsyncExecutionSupport(asyncObj) { } +void AsyncExecutionSupport::waitAsync() { + if (this->asyncObject) { + throwExceptionIfFailed( cuStreamSynchronize(this->asyncObject) ); + this->runningAsync = false; + } +}; +// static +void AsyncExecutionSupport::waitAllAsync(std::initializer_list asyncs) { + for (auto async : asyncs) { + throwExceptionIfFailed( cuStreamSynchronize(async->getBaseAsyncObject()) ); + } +} + + +///// Instance ///// + +Instance *Instance::instance = nullptr; + +void Instance::loadGpuList() { + this->init(); + this->clearGpuList(); + + unsigned int gpuCount = this->getGpuCount(); + for (unsigned int i = 0; i < gpuCount; ++i) { + this->devices.push_back(new Device(i)); + } +} + +Instance::Instance() : BaseInstance(Runtime::GSPAR_RT_CUDA) { } +Instance::~Instance() { + Instance::instance = nullptr; +} +Instance* Instance::getInstance() { + // TODO implement thread-safety + if (!instance) { + instance = new Instance(); + } + return instance; +} + +void Instance::init() { + if (!this->instanceInitiated) { + throwExceptionIfFailed( cuInit(0) ); + this->instanceInitiated = true; + } +} + +unsigned int Instance::getGpuCount() { + this->init(); + int gpuCount = 0; + throwExceptionIfFailed( cuDeviceGetCount(&gpuCount) ); + return gpuCount; +} + + +///// Device ///// + +Device::Device() : BaseDevice() { } +Device::Device(int ordinal) { + this->libDevice = new CUdevice; + this->deviceId = ordinal; + throwExceptionIfFailed( cuDeviceGet(this->libDevice, ordinal) ); +} +Device::~Device() { + // We don't throw exceptions on destructors +#ifdef GSPAR_DEBUG + std::cout << "[GSPar Device " << this << "] Destructing"; +#endif + if (this->defaultExecutionFlow) { + delete this->defaultExecutionFlow; + this->defaultExecutionFlow = NULL; + } + + if (this->libContext && this->libDevice) { + Exception* ex = Exception::checkError( cuCtxSynchronize() ); + if (ex) { + std::cerr << "Failed when waiting for context to synchronize on 
Device's destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + + + ex = Exception::checkError( cuDevicePrimaryCtxRelease(*this->libDevice) ); + if (ex) { + std::cerr << "Failed when releasing primary device context on Device's destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->libContext = NULL; + } + if (this->libDevice) { + delete this->libDevice; + this->libDevice = NULL; + } +#ifdef GSPAR_DEBUG + std::cout << "[GSPar Device " << this << "] Destructed successfully"; +#endif +} +ExecutionFlow* Device::getDefaultExecutionFlow() { + std::lock_guard lock(this->defaultExecutionFlowMutex); // Auto-unlock, RAII + if (!this->defaultExecutionFlow) { + this->defaultExecutionFlow = new ExecutionFlow(this); + } + return this->defaultExecutionFlow; + // Auto-unlock of defaultExecutionFlowMutex, RAII +} +CUcontext Device::getContext() { + if (!this->libContext) { + std::lock_guard lock(this->libContextMutex); // Auto-unlock, RAII + if (!this->libContext) { // Check if someone changed it while we were waiting for the lock + CUcontext context; + throwExceptionIfFailed( cuDevicePrimaryCtxRetain(&context, *this->libDevice) ); + this->setContext(context); + } + // Auto-unlock of libContextMutex, RAII + } + // Sets the context as current for the caller thread + throwExceptionIfFailed( cuCtxSetCurrent(this->libContext) ); + return this->libContext; +} +CUstream Device::startDefaultExecutionFlow() { + return this->getDefaultExecutionFlow()->start(); +} +unsigned int Device::getDeviceId() { + this->getContext(); // There must be a context to call almost everything + return this->deviceId; +} +const std::string Device::getName() { + this->getContext(); // There must be a context to call almost everything + unsigned int default_size = 256; + char* name = new char[default_size]; + throwExceptionIfFailed( cuDeviceGetName(name, default_size, *this->getBaseDeviceObject()) ); + // Try 6 times more + while (default_size <= 16384 && std::string(name).length() > default_size) { + default_size *= 2; + delete name; + name = new char[default_size]; + throwExceptionIfFailed( cuDeviceGetName(name, default_size, *this->getBaseDeviceObject()) ); + } + return name; +} +unsigned int Device::getComputeUnitsCount() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); +} +unsigned int Device::getWarpSize() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_WARP_SIZE); +} +unsigned int Device::getMaxThreadsPerBlock() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); +} +unsigned long Device::getGlobalMemorySizeBytes() { + this->getContext(); // There must be a context to call almost everything + unsigned long bytes; + throwExceptionIfFailed( cuDeviceTotalMem(&bytes, *this->getBaseDeviceObject()) ); + return bytes; +} +unsigned long Device::getLocalMemorySizeBytes() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK); +} +unsigned long Device::getSharedMemoryPerComputeUnitSizeBytes() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); 
+} +unsigned int Device::getClockRateMHz() { + this->getContext(); // There must be a context to call almost everything + return (this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_CLOCK_RATE) / 1000); +} +bool Device::isIntegratedMainMemory() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_INTEGRATED); +} +MemoryObject* Device::malloc(long size, void* hostPtr, bool readOnly, bool writeOnly) { + return new MemoryObject(this, size, hostPtr, readOnly, writeOnly); +} +MemoryObject* Device::malloc(long size, const void* hostPtr) { + return new MemoryObject(this, size, hostPtr); +} +ChunkedMemoryObject* Device::mallocChunked(unsigned int chunks, long chunkSize, void** hostPointers, bool readOnly, bool writeOnly) { + return new ChunkedMemoryObject(this, chunks, chunkSize, hostPointers, readOnly, writeOnly); +} +ChunkedMemoryObject* Device::mallocChunked(unsigned int chunks, long chunkSize, const void** hostPointers) { + return new ChunkedMemoryObject(this, chunks, chunkSize, hostPointers); +} +Kernel* Device::prepareKernel(const std::string kernel_source, const std::string kernel_name) { + this->getContext(); // There must be a context to call almost everything + return new Kernel(this, kernel_source, kernel_name); +} +std::vector Device::prepareKernels(const std::string kernelSource, const std::vector kernelNames) { + this->getContext(); // There must be a context to call almost everything + + std::string programName = "program_" + kernelNames.front(); + + auto programAndModule = this->compileCudaProgramAndLoadModule(kernelSource, programName); + nvrtcProgram cudaProgram = std::get<0>(programAndModule); + CUmodule cudaModule = std::get<1>(programAndModule); + + std::vector kernels; + for (auto name : kernelNames) { + kernels.push_back(new Kernel(this, cudaProgram, cudaModule, name)); + } + return kernels; +} +const int Device::queryInfoNumeric(CUdevice_attribute paramName, bool cacheable) { + // https://www.quora.com/Is-it-thread-safe-to-write-to-distinct-keys-different-key-for-each-thread-in-a-std-map-in-C-for-keys-that-have-existing-entries-in-the-map/answer/John-R-Grout + if (cacheable) { // Check if the attribute is cached + std::lock_guard lock(this->attributeCacheMutex); // Auto-unlock, RAII + auto it = this->attributeCache.find(paramName); + if (it != this->attributeCache.end()) { + return it->second; + } + // Auto-unlock of attributeCacheMutex, RAII + } + + int pi; + throwExceptionIfFailed( cuDeviceGetAttribute(&pi, paramName, *this->getBaseDeviceObject()) ); + if (cacheable) { // Stores the attribute in cache + std::lock_guard lock(this->attributeCacheMutex); // Auto-unlock, RAII + this->attributeCache[paramName] = pi; + // Auto-unlock of attributeCacheMutex, RAII + } + return pi; +} +std::tuple Device::compileCudaProgramAndLoadModule(std::string source, const std::string programName) { +#ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[GSPar Device " << this << "] Kernel received to compile: [" << programName << "] = \n" << source << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + // -------------------------------------------------------------------- + // gets the compute capability + // -------------------------------------------------------------------- + int computeCapabilityMajor = this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + int computeCapabilityMinor = 
this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + std::string computeCapabilityArg = "--gpu-architecture=compute_" + std::to_string(computeCapabilityMajor) + std::to_string(computeCapabilityMinor); + + // -------------------------------------------------------------------- + // Appending additional routines to the kernel source + // -------------------------------------------------------------------- + std::string completeKernelSource = ""; + if (computeCapabilityMajor < 6) { + // atomicAdd() for double-precision floating-point numbers is not available by + // default on devices with compute capability lower than 6.0 + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd + completeKernelSource.append(KernelGenerator::ATOMIC_ADD_POLYFILL); + } + completeKernelSource.append(Instance::getInstance()->getKernelGenerator()->generateStdFunctions()); + completeKernelSource.append(Instance::getInstance()->getKernelGenerator()->replaceMacroKeywords(source)); + +#ifdef GSPAR_DEBUG + ss << "[GSPar Device " << this << "] Complete kernel for compilation: [" << programName << "] = \n" << completeKernelSource << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + nvrtcProgram cudaProgram; + CUmodule cudaModule; + + throwCompilationExceptionIfFailed( nvrtcCreateProgram(&cudaProgram, completeKernelSource.c_str(), programName.c_str(), 0, NULL, NULL), cudaProgram ); + + // https://docs.nvidia.com/cuda/nvrtc/index.html + int numOptions = 7; + const char *compilationOptions[numOptions]; + compilationOptions[0] = "--device-as-default-execution-space"; + compilationOptions[1] = computeCapabilityArg.c_str(); + std::string gsparMacroKernel = "--define-macro=GSPAR_DEVICE_KERNEL=" + KernelGenerator::KERNEL_PREFIX; + compilationOptions[2] = gsparMacroKernel.c_str(); + std::string gsparMacroGlobalMemory = "--define-macro=GSPAR_DEVICE_GLOBAL_MEMORY=" + KernelGenerator::GLOBAL_MEMORY_PREFIX; + compilationOptions[3] = gsparMacroGlobalMemory.c_str(); + std::string gsparMacroSharedMemory = "--define-macro=GSPAR_DEVICE_SHARED_MEMORY=" + KernelGenerator::SHARED_MEMORY_PREFIX; + compilationOptions[4] = gsparMacroSharedMemory.c_str(); + std::string gsparMacroConstant = "--define-macro=GSPAR_DEVICE_CONSTANT=" + KernelGenerator::CONSTANT_PREFIX; + compilationOptions[5] = gsparMacroConstant.c_str(); + std::string gsparMacroDevFunction = "--define-macro=GSPAR_DEVICE_FUNCTION=" + KernelGenerator::DEVICE_FUNCTION_PREFIX; + compilationOptions[6] = gsparMacroDevFunction.c_str(); + +#ifdef GSPAR_DEBUG + ss << "[GSPar Device " << this << "] Compiling kernel with " << numOptions << " options: "; + for (int iDebug = 0; iDebug < numOptions; iDebug++) { + ss << compilationOptions[iDebug] << " "; + } + ss << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + throwCompilationExceptionIfFailed( nvrtcCompileProgram(cudaProgram, numOptions, compilationOptions), cudaProgram ); + + size_t ptxSize; + throwCompilationExceptionIfFailed( nvrtcGetPTXSize(cudaProgram, &ptxSize), cudaProgram ); + char* ptxSource = new char[ptxSize]; + throwCompilationExceptionIfFailed( nvrtcGetPTX(cudaProgram, ptxSource), cudaProgram ); + + unsigned int error_buffer_size = 1024; + std::vector options; + std::vector values; + char* error_log = new char[error_buffer_size]; + //Pointer to a buffer in which to print any log messages that reflect errors + options.push_back(CU_JIT_ERROR_LOG_BUFFER); + values.push_back(error_log); + //Log buffer size in bytes. 
Log messages will be capped at this size (including null terminator) + options.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); + // Casting through uintptr_t avoids compiler warning [https://stackoverflow.com/a/30106751/3136474] + values.push_back((void*)(uintptr_t)error_buffer_size); // https://developer.nvidia.com/nvidia_bug/2917596 + //Determines the target based on the current attached context (default) + options.push_back(CU_JIT_TARGET_FROM_CUCONTEXT); + values.push_back(0); //No option value required for CU_JIT_TARGET_FROM_CUCONTEXT + + Exception::throwIfFailed( cuModuleLoadDataEx(&cudaModule, ptxSource, options.size(), options.data(), values.data()), error_log); + + return std::make_tuple(cudaProgram, cudaModule); +} + + +///// Kernel ///// + +void Kernel::loadCudaFunction(const std::string kernelName) { + throwExceptionIfFailed( cuModuleGetFunction(&this->cudaFunction, this->cudaModule, kernelName.c_str()) ); +} + +Kernel::Kernel() : BaseKernel() { } +Kernel::Kernel(Device* device, const std::string kernelSource, const std::string kernelName) : BaseKernel(device, kernelSource, kernelName) { + std::string programName = "program_" + kernelName; + + auto programAndModule = this->device->compileCudaProgramAndLoadModule(kernelSource, programName); + this->cudaProgram = std::get<0>(programAndModule); + this->cudaModule = std::get<1>(programAndModule); + + this->isPrecompiled = false; //Kernel owns cudaProgram + + this->loadCudaFunction(kernelName); + + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss.str(""); + // See Kernel::getNumBlocksAndThreadsFor for explanation on this code. + int deviceRegsPerBlock = this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); + int funcNumRegs = this->queryInfoNumeric(CU_FUNC_ATTRIBUTE_NUM_REGS); + funcNumRegs *= 1.15; // +15% of margin + ss << "[GSPar Kernel " << this << "] " << this->kernelName << " Device Num regs is " << deviceRegsPerBlock << ", Func Num regs is " << funcNumRegs << "." << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif +} +Kernel::Kernel(Device* device, nvrtcProgram cudaProgram, CUmodule cudaModule, const std::string kernelName) : BaseKernel(device) { + this->cudaProgram = cudaProgram; + this->isPrecompiled = true; //Kernel shares cudaProgram + + this->cudaModule = cudaModule; + + this->loadCudaFunction(kernelName); +} +Kernel::~Kernel() { +#ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[GSPar Kernel " << this << "] Destructing..." << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + if (this->isRunningAsync()) { + this->waitAsync(); + } + if (!this->isPrecompiled && this->cudaProgram) { + nvrtcDestroyProgram(&this->cudaProgram); // We don't throw exceptions on destructors + } +} +void Kernel::cloneInto(BaseKernelBase* baseOther) { + BaseKernel::cloneInto(baseOther); + Kernel* other = static_cast(baseOther); + other->cudaProgram = this->cudaProgram; + other->cudaModule = this->cudaModule; + other->cudaFunction = this->cudaFunction; + other->kernelParams = this->kernelParams; + // TODO Who will destroy the NVRTC program? 
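+    // Note (descriptive only, based on the destructor above): once both kernels are
+    // flagged as precompiled below, neither destructor calls nvrtcDestroyProgram, so
+    // the shared program has to be released elsewhere or it is leaked.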
+ this->isPrecompiled = true; // Now the program is shared + other->isPrecompiled = true; + other->attributeCache = this->attributeCache; +} +int Kernel::setParameter(MemoryObject* memoryObject) { + CUdeviceptr* cudaObject = memoryObject->getBaseMemoryObject(); + this->kernelParams.push_back(cudaObject); + return ++this->parameterCount; +} +int Kernel::setParameter(ChunkedMemoryObject* chunkedMemoryObject) { + CUdeviceptr* cudaObject = chunkedMemoryObject->getBaseMemoryObject(); + this->kernelParams.push_back(cudaObject); + return ++this->parameterCount; +} +int Kernel::setParameter(size_t parm_size, void* parm) { + void *parmPtr = parm; + if (parm_size <= sizeof(unsigned long long)) { // We copy single values + // Should we copy all parameters? + parmPtr = new unsigned char[parm_size]; + memcpy(parmPtr, parm, parm_size); + } + this->kernelParams.push_back(parmPtr); + return ++this->parameterCount; +} +int Kernel::setParameter(size_t parm_size, const void* parm) { + // cuLaunchKernel expects a void**, so we can't work with const + // Another nice trick to cast to void*: https://migocpp.wordpress.com/2018/04/16/cuda-runtime-templates/ + return this->setParameter(parm_size, const_cast(parm)); +} +void Kernel::clearParameters() { + BaseKernel::clearParameters(); + this->kernelParams.clear(); +} +GSPar::Driver::Dimensions Kernel::getNumBlocksAndThreadsFor(Dimensions dims) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss.str(""); + #endif + + unsigned int deviceMaxThreadsPerBlock = this->device->getMaxThreadsPerBlock(); + + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Kernel " << this << "] Max threads per block in device " << this->device << ": " << deviceMaxThreadsPerBlock << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + // Check if the function uses too much registers + int deviceRegsPerBlock = this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); + int funcNumRegs = this->queryInfoNumeric(CU_FUNC_ATTRIBUTE_NUM_REGS); + // In practice, we've seen CUDA exploding with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: too many resources requested for launch + // when we use the exact number of threads that can be used according to the number of registers reported by CU_FUNC_ATTRIBUTE_NUM_REGS. + // The raytracer test is an example of such issue. + // So, we increase this number a little bit to have some margin. + funcNumRegs *= 1.15; // +15% of margin + + unsigned int regsMaxThreadsPerBlock = (double)deviceRegsPerBlock/funcNumRegs; // Max threads per block according to the register usage + + // Actual max threads per block according to device capability and function register usage + unsigned int actualMaxThreadsPerBlock = deviceMaxThreadsPerBlock; + if (regsMaxThreadsPerBlock < deviceMaxThreadsPerBlock) { + actualMaxThreadsPerBlock = regsMaxThreadsPerBlock; + } + + #ifdef GSPAR_DEBUG + ss << "[GSPar Kernel " << this << "] " << this->kernelName << " Device Num regs is " << deviceRegsPerBlock << ", Func Num regs is " << funcNumRegs << ", so max threads per block is " << regsMaxThreadsPerBlock; + ss << ". 
Max threads per block of device is " << deviceMaxThreadsPerBlock << ", but actual max threads is " << actualMaxThreadsPerBlock << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + size_t maxThreadsDimension[SUPPORTED_DIMS] = { + (size_t)this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), + (size_t)this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), + (size_t)this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z), + }; + + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Kernel " << this << "] Max threads per dimension is " << maxThreadsDimension[0] << " x " << maxThreadsDimension[1] << " x " << maxThreadsDimension[2] << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + return this->getNumBlocksAndThreads(dims, actualMaxThreadsPerBlock, maxThreadsDimension); +} +void Kernel::runAsync(Dimensions dims, ExecutionFlow* executionFlow) { + + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[GSPar Kernel " << this << "] Running kernel async with " << this->kernelParams.size() << " parameters for " << dims.toString() << " in flow " << executionFlow << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + + if (!dims.x) { + throw Exception("The first dimension is required to run a kernel"); + } + + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Kernel " << this << "] Checking max threads per block in device " << this->device << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + Dimensions blocksAndThreads = this->getNumBlocksAndThreadsFor(dims); + + unsigned int numBlocks[SUPPORTED_DIMS] = { + (unsigned int)blocksAndThreads.x.min, + (unsigned int)blocksAndThreads.y.min, + (unsigned int)blocksAndThreads.z.min + }; + unsigned int numThreads[SUPPORTED_DIMS] = { + (unsigned int)blocksAndThreads.x.max, + (unsigned int)blocksAndThreads.y.max, + (unsigned int)blocksAndThreads.z.max + }; + + #ifdef GSPAR_DEBUG + ss << "[GSPar Kernel " << this << "] Starting kernel with " << this->kernelParams.size() << " parameters" << std::endl; + ss << "[GSPar Kernel " << this << "] Shall start " << dims.toString() << " threads: "; + ss << "starting (" << numThreads[0] << "," << numThreads[1] << "," << numThreads[2] << ") threads "; + ss << "in (" << numBlocks[0] << "," << numBlocks[1] << "," << numBlocks[2] << ") blocks "; + ss << "using " << this->sharedMemoryBytes << " bytes of shared memory in execution flow " << executionFlow << " (CUstream " << cudaStream << ")" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + throwExceptionIfFailed( cuLaunchKernel(this->cudaFunction, + numBlocks[0], numBlocks[1], numBlocks[2], // 3D blocks + numThreads[0], numThreads[1], numThreads[2], // 3D threads + this->sharedMemoryBytes, cudaStream, this->kernelParams.data(), NULL) ); + + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Kernel " << this << "] Started kernel execution in execution flow " << executionFlow << " (CUstream " << cudaStream << ")" << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + this->setBaseAsyncObject(cudaStream); + + this->runningAsync = true; +} +const int Kernel::queryInfoNumeric(CUfunction_attribute paramName, bool cacheable) { + if (cacheable) { // Check if the attribute is cached + // We don't use locks here because the Kernel object is not intended to be shared among threads + auto it = this->attributeCache.find(paramName); + if (it != 
this->attributeCache.end()) { + return it->second; + } + } + + int pi; + throwExceptionIfFailed( cuFuncGetAttribute(&pi, paramName, this->cudaFunction) ); + if (cacheable) { // Stores the attribute in cache + this->attributeCache[paramName] = pi; + } + return pi; +} + + + +///// MemoryObject ///// +void MemoryObject::allocDeviceMemory() { + this->device->getContext(); // There must be a context to call cuMemAlloc + + this->devicePtr = new CUdeviceptr; // It is initialized as NULL, we have to allocate space for it + throwExceptionIfFailed( cuMemAlloc(this->devicePtr, size) ); +} + +MemoryObject::MemoryObject(Device* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly) : BaseMemoryObject(device, size, hostPtr, readOnly, writeOnly) { + this->allocDeviceMemory(); +} +MemoryObject::MemoryObject(Device* device, size_t size, const void* hostPtr) : BaseMemoryObject(device, size, hostPtr) { + this->allocDeviceMemory(); +} +MemoryObject::~MemoryObject() { + if (this->devicePtr) { + cuMemFree(*(this->devicePtr)); // We don't throw exceptions on destructors + this->devicePtr = NULL; + } + if (this->isPinnedHostMemory()) { + cuMemHostUnregister(this->hostPtr); // We don't throw exceptions on destructors + } +} +void MemoryObject::pinHostMemory() { + if (!this->isPinnedHostMemory()) { // TODO implement thread-safety + CUresult result = cuMemHostRegister(this->hostPtr, this->size, 0); + if (result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED) { + throwExceptionIfFailed(result); + } + } + BaseMemoryObject::pinHostMemory(); +} + +void MemoryObject::copyIn() { + throwExceptionIfFailed( cuMemcpyHtoD(*(this->devicePtr), this->hostPtr, this->size) ); +} +void MemoryObject::copyOut() { + throwExceptionIfFailed( cuMemcpyDtoH(this->hostPtr, *(this->devicePtr), this->size) ); +} +void MemoryObject::copyInAsync(ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + throwExceptionIfFailed( cuMemcpyHtoDAsync(*(this->devicePtr), this->hostPtr, this->size, cudaStream) ); + this->setBaseAsyncObject(cudaStream); +} +void MemoryObject::copyOutAsync(ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + throwExceptionIfFailed( cuMemcpyDtoHAsync(this->hostPtr, *(this->devicePtr), this->size, cudaStream) ); + this->setBaseAsyncObject(cudaStream); +} + + + +///// ChunkedMemoryObject ///// + +void ChunkedMemoryObject::allocDeviceMemory() { + this->device->getContext(); // There must be a context to call cuMemAlloc + + this->devicePtr = new CUdeviceptr; // It is initialized as NULL, we have to allocate space for it + throwExceptionIfFailed( cuMemAlloc(this->devicePtr, this->getChunkSize() * this->chunks) ); // We allocate space for all the chunks +} + +ChunkedMemoryObject::ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly) : + BaseChunkedMemoryObject(device, chunks, chunkSize, hostPointers, readOnly, writeOnly) { + this->allocDeviceMemory(); +} +ChunkedMemoryObject::ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, const void** hostPointers) : + BaseChunkedMemoryObject(device, chunks, chunkSize, hostPointers) { + this->allocDeviceMemory(); +} +ChunkedMemoryObject::~ChunkedMemoryObject() { } +void ChunkedMemoryObject::pinHostMemory() { + // TODO implement pinned memory in chunked memory objects + // We need to keep this empty method here while it is not implemented so the parent method 
does not get called +} +void ChunkedMemoryObject::copyIn() { + for (unsigned int chunk = 0; chunk < this->chunks; chunk++) { + this->copyIn(chunk); + } +} +void ChunkedMemoryObject::copyOut() { + for (unsigned int chunk = 0; chunk < this->chunks; chunk++) { + this->copyOut(chunk); + } +} +void ChunkedMemoryObject::copyInAsync(ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + for (unsigned int chunk = 0; chunk < this->chunks; chunk++) { + // We don't call copyInAsync(chunk) to avoid calling checkAndStartFlow for each chunk + throwExceptionIfFailed( cuMemcpyHtoDAsync((CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->hostPointers[chunk], this->getChunkSize(), cudaStream) ); + } + this->setBaseAsyncObject(cudaStream); +} +void ChunkedMemoryObject::copyOutAsync(ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + for (unsigned int chunk = 0; chunk < this->chunks; chunk++) { + // We don't call copyOutAsync(chunk) to avoid calling checkAndStartFlow for each chunk + throwExceptionIfFailed( cuMemcpyDtoHAsync(this->hostPointers[chunk], (CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->getChunkSize(), cudaStream) ); + } + this->setBaseAsyncObject(cudaStream); +} +void ChunkedMemoryObject::copyIn(unsigned int chunk) { + throwExceptionIfFailed( cuMemcpyHtoD((CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->hostPointers[chunk], this->getChunkSize()) ); +} +void ChunkedMemoryObject::copyOut(unsigned int chunk) { + throwExceptionIfFailed( cuMemcpyDtoH(this->hostPointers[chunk], (CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->getChunkSize()) ); +} +void ChunkedMemoryObject::copyInAsync(unsigned int chunk, ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + throwExceptionIfFailed( cuMemcpyHtoDAsync((CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->hostPointers[chunk], this->getChunkSize(), cudaStream) ); + this->setBaseAsyncObject(cudaStream); +} +void ChunkedMemoryObject::copyOutAsync(unsigned int chunk, ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + throwExceptionIfFailed( cuMemcpyDtoHAsync(this->hostPointers[chunk], (CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->getChunkSize(), cudaStream) ); + this->setBaseAsyncObject(cudaStream); +} + + +///// StreamElement ///// + +StreamElement::StreamElement(Device* device) : BaseStreamElement(device) { + // Can't call this virtual function in the base constructor + this->start(); +} + +StreamElement::~StreamElement() { } + + +///// KernelGenerator ///// + +const std::string KernelGenerator::KERNEL_PREFIX = "extern \"C\" __global__"; +const std::string KernelGenerator::GLOBAL_MEMORY_PREFIX = ""; +const std::string KernelGenerator::SHARED_MEMORY_PREFIX = "extern __shared__"; +const std::string KernelGenerator::CONSTANT_PREFIX = "const"; +const std::string KernelGenerator::DEVICE_FUNCTION_PREFIX = "__device__"; +const std::string KernelGenerator::ATOMIC_ADD_POLYFILL = "" + "__device__ double atomicAdd(double* address, double val){ \n" + " unsigned long long int* address_as_ull = (unsigned long long int*)address; \n" + " unsigned long long int old = *address_as_ull, assumed; 
\n" + " do { \n" + " assumed = old; \n" + " old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); \n" + " } while (assumed != old); \n" + " return __longlong_as_double(old); \n" + "} \n"; + +const std::string KernelGenerator::getKernelPrefix() { + return KernelGenerator::KERNEL_PREFIX + " void"; +} +std::string KernelGenerator::generateStdFunctions() { + Dimensions dims({1, 1, 1}); + std::string gspar_get_gid = "__device__ size_t gspar_get_global_id(unsigned int dimension) { \n"; + std::string gspar_get_tid = "__device__ size_t gspar_get_thread_id(unsigned int dimension) { \n"; + std::string gspar_get_bid = "__device__ size_t gspar_get_block_id(unsigned int dimension) { \n"; + std::string gspar_get_bsize = "__device__ size_t gspar_get_block_size(unsigned int dimension) { \n"; + std::string gspar_get_gridsize = "__device__ size_t gspar_get_grid_size(unsigned int dimension) { \n"; + for (int d = 0; d < dims.getCount(); d++) { + std::string dimName = dims.getName(d); + gspar_get_gid += " if (dimension == " + std::to_string(d) + ") return blockIdx." + dimName + " * blockDim."+dimName+" + threadIdx." + dimName + "; \n"; + gspar_get_tid += " if (dimension == " + std::to_string(d) + ") return threadIdx." + dimName + "; \n"; + gspar_get_bid += " if (dimension == " + std::to_string(d) + ") return blockIdx." + dimName + "; \n"; + gspar_get_bsize += " if (dimension == " + std::to_string(d) + ") return blockDim." + dimName + "; \n"; + gspar_get_gridsize += " if (dimension == " + std::to_string(d) + ") return gridDim." + dimName + "; \n"; + } + gspar_get_gid += " return 0; } \n"; + gspar_get_tid += " return 0; } \n"; + gspar_get_bid += " return 0; } \n"; + gspar_get_bsize += " return 0; } \n"; + gspar_get_gridsize += " return 0; } \n"; + + return gspar_get_gid + gspar_get_tid + gspar_get_bid + gspar_get_bsize + gspar_get_gridsize + + "extern \"C\" __device__ void gspar_synchronize_local_threads() { __syncthreads(); } \n" + // Atomic functions + "__device__ int gspar_atomic_add_int(int* valq, int delta) { return atomicAdd(valq, delta); } \n" + "__device__ double gspar_atomic_add_double(double* valq, double delta) { return atomicAdd(valq, delta); } \n" + ; +} +std::string KernelGenerator::replaceMacroKeywords(std::string kernelSource) { + kernelSource = std::regex_replace(kernelSource, std::regex("GSPAR_DEVICE_MACRO_BEGIN"), "#define"); + kernelSource = std::regex_replace(kernelSource, std::regex("GSPAR_DEVICE_MACRO_END"), "\n"); + return kernelSource; +} +std::string KernelGenerator::generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::string r = ""; + if (pattern->isUsingSharedMemory()) { + auto shmem = pattern->getSharedMemoryParameter(); + r += KernelGenerator::SHARED_MEMORY_PREFIX + " " + shmem->getNonPointerTypeName() + " " + shmem->name + "[];"; + } + return r; +} +std::string KernelGenerator::generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::string r = ""; + for(int d = 0; d < dims.getCount(); d++) { + if (dims.is(d)) { + std::string varName = this->getStdVarNameForDimension(pattern->getStdVarNames(), d); + r += "const unsigned long gspar_max_" + varName + ","; + if (dims[d].min && !pattern->isBatched()) { // Same check as generateStdVariables + // TODO Support min in batches + r += "const unsigned long gspar_min_" + varName + ","; + } + } + } + if (pattern->isBatched()) { + // This names are used in other methods + r += "unsigned int gspar_batch_size,"; + } + for(auto ¶m : 
pattern->getParameterList()) { + if (param->direction != Pattern::ParameterDirection::GSPAR_PARAM_NONE) { + if (param->direction == Pattern::ParameterDirection::GSPAR_PARAM_IN && param->isConstant()) { + r += "const "; + } + r += param->toKernelParameter() + ","; + } + } + if (!r.empty()) r.pop_back(); // removes last comma + return r; +} +std::string KernelGenerator::generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::array patternNames = pattern->getStdVarNames(); + + std::string r; + for(int d = 0; d < dims.getCount(); d++) { + if (dims[d]) { + std::string varName = this->getStdVarNameForDimension(patternNames, d); + // Standard variables are uint3 according do CUDA specification + // By using size_t we can keep the same type of OpenCL driver + if (pattern->isBatched()) { + r += "size_t gspar_global_" + varName; + } else { + r += "size_t " + varName; + } + r += " = gspar_get_global_id(" + std::to_string(d) + ")"; + if (dims[d].min && !pattern->isBatched()) { // Same check as generateParams + // TODO Support min in batches + r += " + gspar_min_" + varName; + } + r += "; \n"; + // TODO Support multi-dimensional batches + if (pattern->isBatched()) { + // Intended implicit floor(gspar_global/dims) + r += "size_t gspar_batch_" + varName + " = ((size_t)(gspar_global_" + varName + " / gspar_max_" + varName + ")); \n"; + r += "size_t gspar_offset_" + varName + " = gspar_batch_" + varName + " * gspar_max_" + varName + "; \n"; + // This variable names are used in other methods, keep track + r += "size_t " + varName + " = gspar_global_" + varName + " - gspar_offset_" + varName + "; \n"; + } + } + } + return r; +} +std::string KernelGenerator::generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions max) { + std::array patternNames = pattern->getStdVarNames(); + // TODO Support multi-dimensional batches + std::string stdVarFirstDimension = this->getStdVarNameForDimension(patternNames, 0); + + std::string r = ""; + for(auto ¶m : pattern->getParameterList()) { + if (param->isBatched()) { + if (param->direction == Pattern::ParameterDirection::GSPAR_PARAM_IN && param->isConstant()) { + r += "const "; + } + r += param->type.getFullName() + " " + param->name + " = "; + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + r += "&" + param->getKernelParameterName() + "[gspar_offset_" + stdVarFirstDimension + "]"; + } else if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { + r += param->getKernelParameterName() + "[gspar_batch_" + stdVarFirstDimension + "]"; + } + r += ";\n"; + } + } + return r; +} diff --git a/src/GSPar_CUDA.hpp b/src/GSPar_CUDA.hpp new file mode 100644 index 0000000..fc91c02 --- /dev/null +++ b/src/GSPar_CUDA.hpp @@ -0,0 +1,262 @@ + +#ifndef __GSPAR_CUDA_INCLUDED__ +#define __GSPAR_CUDA_INCLUDED__ + +#include +#include +#include +#include +#include +#include + +///// Forward declarations ///// + +namespace GSPar { + namespace Driver { + namespace CUDA { + class Exception; + class ExecutionFlow; + class AsyncExecutionSupport; + class Instance; + class Device; + class Kernel; + class MemoryObject; + class ChunkedMemoryObject; + class StreamElement; + class KernelGenerator; + } + } +} + +#include "GSPar_BaseGPUDriver.hpp" + +namespace GSPar { + namespace Driver { + namespace CUDA { + + ///// Exception ///// + + class Exception : + public BaseException { + protected: + std::string getErrorString(CUresult code) override; + + public: + explicit Exception(std::string msg, 
std::string details = ""); + explicit Exception(CUresult code, std::string details = ""); + + static Exception* checkError(CUresult code, std::string details = ""); + static void throwIfFailed(CUresult code, std::string details = ""); + }; + + #define throwCompilationExceptionIfFailed( code, cudaProgram ) CompilationException::throwIfFailed( code, cudaProgram, defaultExceptionDetails() ) + + class CompilationException : + public BaseException { + protected: + std::string getErrorString(nvrtcResult code) override; + + public: + explicit CompilationException(std::string msg, std::string details = ""); + explicit CompilationException(nvrtcResult code, std::string details = ""); + + static CompilationException* checkError(nvrtcResult code, std::string details = ""); + static void throwIfFailed(nvrtcResult code, std::string details = ""); + static void throwIfFailed(nvrtcResult code, nvrtcProgram cudaProgram, std::string details = ""); + }; + + ///// ExecutionFlow ///// + + class ExecutionFlow : + virtual public BaseExecutionFlow { + public: + ExecutionFlow(); + explicit ExecutionFlow(Device* device); + virtual ~ExecutionFlow(); + CUstream start() override; + void synchronize() override; + + static CUstream checkAndStartFlow(Device* device, ExecutionFlow* executionFlow = NULL); + }; + + ///// AsyncExecutionSupport ///// + + class AsyncExecutionSupport : + virtual public BaseAsyncExecutionSupport { + public: + AsyncExecutionSupport(CUstream asyncObj = NULL); + void waitAsync() override; + + static void waitAllAsync(std::initializer_list asyncs); + }; + + ///// Instance ///// + + class Instance : + public BaseInstance { + protected: + static Instance *instance; + void loadGpuList() override; + + public: + Instance(); + virtual ~Instance(); + void init() override; + unsigned int getGpuCount() override; + + static Instance* getInstance(); + }; + + ///// Device ///// + + class Device : + public BaseDevice { + private: + mutable std::mutex attributeCacheMutex; + std::map attributeCache; + int deviceId; + + public: + Device(); + explicit Device(int ordinal); + virtual ~Device(); + ExecutionFlow* getDefaultExecutionFlow() override; + CUcontext getContext() override; + CUstream startDefaultExecutionFlow() override; + unsigned int getDeviceId(); + const std::string getName() override; + unsigned int getComputeUnitsCount() override; + unsigned int getWarpSize() override; + unsigned int getMaxThreadsPerBlock() override; + unsigned long getGlobalMemorySizeBytes() override; + unsigned long getLocalMemorySizeBytes() override; + unsigned long getSharedMemoryPerComputeUnitSizeBytes() override; + unsigned int getClockRateMHz() override; + bool isIntegratedMainMemory() override; + MemoryObject* malloc(long size, void* hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) override; + MemoryObject* malloc(long size, const void* hostPtr = nullptr) override; + ChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, void** hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) override; + ChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, const void** hostPtr = nullptr) override; + Kernel* prepareKernel(const std::string kernelSource, const std::string kernelName) override; + std::vector prepareKernels(const std::string kernelSource, const std::vector kernelNames) override; + + // const char* queryInfoText(cl_device_info paramName); + const int queryInfoNumeric(CUdevice_attribute paramName, bool cacheable = true); + std::tuple 
compileCudaProgramAndLoadModule(std::string source, const std::string programName); + }; + + ///// Kernel ///// + + class Kernel : + public BaseKernel, + public AsyncExecutionSupport { + private: + nvrtcProgram cudaProgram = NULL; + CUmodule cudaModule = NULL; + CUfunction cudaFunction = NULL; + std::vector kernelParams; + bool isPrecompiled; + std::map attributeCache; + + void loadCudaFunction(const std::string kernelName); + + public: + Kernel(); + Kernel(Device* device, const std::string kernelSource, const std::string kernelName); + virtual ~Kernel(); + virtual void cloneInto(BaseKernelBase* baseOther) override; + int setParameter(MemoryObject* memoryObject) override; + int setParameter(ChunkedMemoryObject* chunkedMemoryObject) override; + int setParameter(size_t parmSize, void* parm) override; + int setParameter(size_t parmSize, const void* parm) override; + void clearParameters() override; + Dimensions getNumBlocksAndThreadsFor(Dimensions dims) override; + void runAsync(Dimensions max, ExecutionFlow* executionFlow = NULL) override; + + Kernel(Device* device, nvrtcProgram cudaProgram, CUmodule cudaModule, const std::string kernelName); + const int queryInfoNumeric(CUfunction_attribute paramName, bool cacheable = true); + }; + + ///// MemoryObject ///// + + class MemoryObject : + public BaseMemoryObject, + public AsyncExecutionSupport { + private: + void allocDeviceMemory(); + public: + MemoryObject(Device* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly); + MemoryObject(Device* device, size_t size, const void* hostPtr); + virtual ~MemoryObject(); + virtual void pinHostMemory() override; + virtual void copyIn() override; + virtual void copyOut() override; + virtual void copyInAsync(ExecutionFlow* executionFlow = NULL) override; + virtual void copyOutAsync(ExecutionFlow* executionFlow = NULL) override; + }; + + ///// ChunkedMemoryObject ///// + + class ChunkedMemoryObject : + public BaseChunkedMemoryObject, + public AsyncExecutionSupport { + private: + void allocDeviceMemory(); + + public: + ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly); + ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, const void** hostPointers); + virtual ~ChunkedMemoryObject(); + virtual void pinHostMemory() override; + // Copy all chunks + virtual void copyIn() override; + virtual void copyOut() override; + virtual void copyInAsync(ExecutionFlow* executionFlow = NULL) override; + virtual void copyOutAsync(ExecutionFlow* executionFlow = NULL) override; + // Copy specific chunks of memory. We can't use function overloading due to the override. 
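+            // Illustrative sketch only (dev, hostPtrs and flow are hypothetical names, not part of this patch):
+            //   ChunkedMemoryObject* chunked = dev->mallocChunked(chunks, chunkSize, hostPtrs);
+            //   chunked->copyIn(0);               // blocking copy of the first chunk only
+            //   chunked->copyOutAsync(1, flow);   // asynchronous copy-out of the second chunk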
+ virtual void copyIn(unsigned int chunk); + virtual void copyOut(unsigned int chunk); + virtual void copyInAsync(unsigned int chunk, ExecutionFlow* executionFlow = NULL); + virtual void copyOutAsync(unsigned int chunk, ExecutionFlow* executionFlow = NULL); + }; + + ///// StreamElement ///// + + class StreamElement : + public BaseStreamElement, + public AsyncExecutionSupport, + public ExecutionFlow { + private: + Kernel* kernel; + + public: + explicit StreamElement(Device* device); + ~StreamElement(); + }; + + ///// KernelGenerator ///// + + class KernelGenerator : + public BaseKernelGenerator { + public: + static const std::string KERNEL_PREFIX; + static const std::string GLOBAL_MEMORY_PREFIX; + static const std::string SHARED_MEMORY_PREFIX; + static const std::string CONSTANT_PREFIX; + static const std::string DEVICE_FUNCTION_PREFIX; + static const std::string ATOMIC_ADD_POLYFILL; + const std::string getKernelPrefix() override; + std::string generateStdFunctions() override; + std::string replaceMacroKeywords(std::string kernelSource) override; + std::string generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + + }; + + } + } +} + +#endif diff --git a/src/GSPar_OpenCL.cpp b/src/GSPar_OpenCL.cpp new file mode 100644 index 0000000..258bd02 --- /dev/null +++ b/src/GSPar_OpenCL.cpp @@ -0,0 +1,1051 @@ + +#include +#include +#include +#include +#ifdef GSPAR_DEBUG +#include +#include +#endif + +#include "GSPar_OpenCL.hpp" + +using namespace GSPar::Driver::OpenCL; + +// extern "C" void CL_CALLBACK ocl_pfn_notify(const char *errinfo, const void *private_info, size_t cb, void *user_data) { +// std::cerr << "OpenCL notified an error: " << errinfo << std::endl; +// } + +///// Exception ///// + +std::string Exception::getErrorString(cl_int code) { + switch(code) { + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return 
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } +} + +Exception::Exception(std::string msg, std::string details) : BaseException(msg, details) { } +Exception::Exception(cl_int code, std::string details) : BaseException(code, details) { + // Can't call this virtual function in the base constructor + this->msg = this->getErrorString(code); +} +// static +Exception* Exception::checkError(cl_int code, std::string details) { + return BaseException::checkError(code, CL_SUCCESS, details); +} +// static +void Exception::throwIfFailed(cl_int code, std::string details) { + BaseException::throwIfFailed(code, CL_SUCCESS, details); +} + +Exception::Exception(cl_int code, cl_program program, cl_device_id device) : Exception(code) { + if (code == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = new char[log_size]; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + this->msg += std::string(" - ") + std::string(log); + } +} +// static +Exception* Exception::checkError(cl_int code, cl_program program, cl_device_id device) { + if (code != CL_SUCCESS) { + return new Exception(code, program, device); + } + return NULL; +} +// static +void Exception::throwIfFailed(cl_int code, cl_program program, cl_device_id device) { + Exception* ex = Exception::checkError(code, program, device); + if (ex != NULL) { + throw *ex; + } +} + + +///// ExecutionFlow ///// + +ExecutionFlow::ExecutionFlow() : BaseExecutionFlow() { } +ExecutionFlow::ExecutionFlow(Device* device) : BaseExecutionFlow(device) { } +ExecutionFlow::~ExecutionFlow() { + // We don't throw exceptions on destructors + if (this->flowObject) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + 
ss << "[" << std::this_thread::get_id() << " GSPar ExFlow] Releasing command queue " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseCommandQueue(this->flowObject) ); + if (ex != nullptr) { + std::cerr << "Failed when releasing OpenCL command queue of execution flow: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + } +} +cl_command_queue ExecutionFlow::start() { + if (!this->device) { + // Can't start flow on a NULL device + throw Exception("A device is required to start an execution flow", defaultExceptionDetails()); + } + if (!this->flowObject) { + this->device = device; + cl_int status; + this->flowObject = clCreateCommandQueue(device->getContext(), device->getBaseDeviceObject(), 0, &status); + throwExceptionIfFailed(status); + } + return this->getBaseFlowObject(); +} +void ExecutionFlow::synchronize() { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar ExFlow " << this << "] Synchronizing" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + // clEnqueueMarker(cl_command_queue, cl_event) is deprecated in OpenCL 1.2 + + // cl_event evt; + // throwExceptionIfFailed( clEnqueueMarkerWithWaitList(this->getBaseFlowObject(), 0, NULL, &evt) ); + // throwExceptionIfFailed( clWaitForEvents(1, &evt) ); + // throwExceptionIfFailed( clReleaseEvent(evt) ); + throwExceptionIfFailed( clFinish(this->flowObject) ); +} +cl_command_queue ExecutionFlow::checkAndStartFlow(Device* device, ExecutionFlow* executionFlow) { + return BaseExecutionFlow::checkAndStartFlow(device, executionFlow); +} + + + +///// AsyncExecutionSupport ///// + +AsyncExecutionSupport::AsyncExecutionSupport(cl_event *asyncObjs, unsigned int numAsyncEvents) : + BaseAsyncExecutionSupport(asyncObjs), numAsyncEvents(numAsyncEvents) { } +AsyncExecutionSupport::~AsyncExecutionSupport() { + try { + this->releaseBaseAsyncObject(); + } catch (GSPar::GSParException &ex) { // We don't throw exceptions on destructors + std::cerr << "Failed when releasing OpenCL event on AsyncExecutionSupport destructor: "; + std::cerr << ex.what() << " - " << ex.getDetails() << std::endl; + this->asyncObject = NULL; + } +} +void AsyncExecutionSupport::setBaseAsyncObject(cl_event *asyncObject) { + this->setBaseAsyncObject(asyncObject, 1); +} +void AsyncExecutionSupport::setBaseAsyncObject(cl_event *asyncObject, unsigned int numAsyncEvents) { + this->releaseBaseAsyncObject(); // Release current object + BaseAsyncExecutionSupport::setBaseAsyncObject(asyncObject); + this->numAsyncEvents = numAsyncEvents; +} +void AsyncExecutionSupport::waitAsync() { + if (this->executionFlow) { + this->executionFlow->synchronize(); + } else if (this->asyncObject) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Async " << this << "] Waiting for " << this->numAsyncEvents << " events: " << this->asyncObject << std::endl; + std::cout << ss.str(); + ss.str(""); + + // CL_QUEUED: 3 + // CL_SUBMITTED: 2 + // CL_RUNNING: 1 + // CL_COMPLETE: 0 + cl_int status; + throwExceptionIfFailed( clGetEventInfo(*this->asyncObject, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL) ); + // CL_COMMAND_NDRANGE_KERNEL: 4592 + // CL_COMMAND_TASK: 4593 + // CL_COMMAND_NATIVE_KERNEL: 4594 + // CL_COMMAND_READ_BUFFER: 4595 + // CL_COMMAND_WRITE_BUFFER: 4596 + // 
CL_COMMAND_COPY_BUFFER: 4597 + // CL_COMMAND_READ_IMAGE: 4598 + // CL_COMMAND_WRITE_IMAGE: 4599 + // CL_COMMAND_COPY_IMAGE: 4600 + // CL_COMMAND_COPY_BUFFER_TO_IMAGE: 4602 + // CL_COMMAND_COPY_IMAGE_TO_BUFFER: 4601 + // CL_COMMAND_MAP_BUFFER: 4603 + // CL_COMMAND_MAP_IMAGE: 4604 + // CL_COMMAND_UNMAP_MEM_OBJECT: 4605 + // CL_COMMAND_MARKER: 4606 + // CL_COMMAND_ACQUIRE_GL_OBJECTS: 4607 + // CL_COMMAND_RELEASE_GL_OBJECTS: 4608 + // CL_COMMAND_READ_BUFFER_RECT: 4609 + // CL_COMMAND_WRITE_BUFFER_RECT: 4610 + // CL_COMMAND_COPY_BUFFER_RECT: 4611 + // CL_COMMAND_USER: 4612 + // CL_COMMAND_BARRIER: 4613 + // CL_COMMAND_MIGRATE_MEM_OBJECTS: 4614 + // CL_COMMAND_FILL_BUFFER: 4615 + // CL_COMMAND_FILL_IMAGE: 4616 + // CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR: 8205 + cl_command_type type; + throwExceptionIfFailed( clGetEventInfo(*this->asyncObject, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &type, NULL) ); + + ss << "[" << std::this_thread::get_id() << " GSPar Async " << this << "] Event " << this->asyncObject << " of type " << type << " is of status " << status << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + throwExceptionIfFailed( clWaitForEvents(this->numAsyncEvents, this->asyncObject) ); + } + this->releaseBaseAsyncObject(); +} +void AsyncExecutionSupport::releaseBaseAsyncObject() { + if (this->executionFlow) { + // We don't own this ExecutionFlow, it's just a weak reference, so we don't delete it + this->executionFlow = nullptr; + } + if (this->asyncObject) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Async " << this << "] Releasing " << this->numAsyncEvents << " events: " << this->asyncObject << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + for (unsigned int i = 0; i < this->numAsyncEvents; i++) { + throwExceptionIfFailed( clReleaseEvent(this->asyncObject[i]) ); + } + this->asyncObject = NULL; + } + this->clearRunningAsync(); // We can't be running async since we don't have the async objects anymore +} +// static +void AsyncExecutionSupport::waitAllAsync(std::initializer_list asyncs) { + std::vector oclEvents; + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Async] Waiting for all async events" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + for (auto async : asyncs) { + // std::cout << "Waiting for all cl_events " << async->asyncObject << " " << *async->asyncObject << std::endl; + oclEvents.insert(oclEvents.end(), async->getBaseAsyncObject(), async->getBaseAsyncObject()+async->numAsyncEvents); + } + if (oclEvents.size() > 0) { + throwExceptionIfFailed( clWaitForEvents(oclEvents.size(), oclEvents.data()) ); + } + for (auto async : asyncs) { + async->releaseBaseAsyncObject(); + } +} + + +///// Instance ///// + +Instance *Instance::instance = nullptr; + +void Instance::loadGpuList() { + this->clearGpuList(); + + cl_uint platformCount; + throwExceptionIfFailed( clGetPlatformIDs(0, NULL, &platformCount) ); + + cl_platform_id* platforms = new cl_platform_id[platformCount]; + throwExceptionIfFailed( clGetPlatformIDs(platformCount, platforms, NULL) ); + + for (unsigned int i = 0; i < platformCount; ++i) { + cl_uint deviceCount; + throwExceptionIfFailed( clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &deviceCount) ); + + cl_device_id* deviceIds = new cl_device_id[deviceCount]; + throwExceptionIfFailed( clGetDeviceIDs(platforms[i], 
CL_DEVICE_TYPE_GPU, deviceCount, deviceIds, NULL) ); + + for (unsigned int d = 0; d < deviceCount; ++d) { + this->devices.push_back(new Device(deviceIds[d])); + } + } + + delete[] platforms; +} + +Instance::Instance() : BaseInstance(Runtime::GSPAR_RT_OPENCL) { } +Instance::~Instance() { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Instance] Deleting Singleton instance " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Instance::instance = nullptr; +} +Instance* Instance::getInstance() { + // TODO implement thread-safety + if (!instance) { + instance = new Instance(); + } + return instance; +} + +void Instance::init() { + this->instanceInitiated = true; +} + +unsigned int Instance::getGpuCount() { + unsigned int gpuCount = 0; + + cl_uint platformCount; + throwExceptionIfFailed( clGetPlatformIDs(0, NULL, &platformCount) ); + + cl_platform_id* platforms = new cl_platform_id[platformCount]; + throwExceptionIfFailed( clGetPlatformIDs(platformCount, platforms, NULL) ); + + for (unsigned int i = 0; i < platformCount; ++i) { + cl_uint deviceCount; + throwExceptionIfFailed( clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &deviceCount) ); + + gpuCount += deviceCount; + } + + delete[] platforms; + + return gpuCount; +} + + +///// Device ///// + +Device::Device() : BaseDevice() { } +Device::Device(cl_device_id device) { + this->setBaseDeviceObject(device); +} +Device::~Device() { + // We don't throw exceptions on destructors + if (this->defaultExecutionFlow) { + delete this->defaultExecutionFlow; + this->defaultExecutionFlow = NULL; + } + + if (this->libContext) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Device] Releasing context " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseContext(this->libContext) ); + if (ex) { + std::cerr << "Failed when releasing device context on Device's destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->libContext = NULL; + } +} +ExecutionFlow* Device::getDefaultExecutionFlow() { + if (!this->defaultExecutionFlow) { + this->defaultExecutionFlow = new ExecutionFlow(this); + } + return this->defaultExecutionFlow; +} +cl_context Device::getContext() { + if (!this->libContext) { + std::lock_guard lock(this->libContextMutex); + if (!this->libContext) { // Check if someone changed it while we were waiting for the lock + cl_int status; + // TODO add a CL_CALLBACK to get notified of errors. 
Check opencl versions in test/comparison for an example + cl_context context = clCreateContext(NULL, 1, &this->libDevice, NULL, NULL, &status); + throwExceptionIfFailed(status); + this->setContext(context); + } + // Auto-unlock of libContextMutex, RAII + } + return this->libContext; +} +cl_command_queue Device::startDefaultExecutionFlow() { + return this->getDefaultExecutionFlow()->start(); +} +const std::string Device::getName() { + return this->queryInfoDevice(CL_DEVICE_NAME); +} +unsigned int Device::getComputeUnitsCount() { + return *(this->queryInfoDevice(CL_DEVICE_MAX_COMPUTE_UNITS)); +} +unsigned int Device::getWarpSize() { + // TODO warp size is available only for NVIDIA GPUs + return *(this->queryInfoDevice(CL_DEVICE_WARP_SIZE_NV)); +} +unsigned int Device::getMaxThreadsPerBlock() { + return *(this->queryInfoDevice(CL_DEVICE_MAX_WORK_GROUP_SIZE)); +} +unsigned long Device::getGlobalMemorySizeBytes() { + return *(this->queryInfoDevice(CL_DEVICE_GLOBAL_MEM_SIZE)); +} +unsigned long Device::getLocalMemorySizeBytes() { + return *(this->queryInfoDevice(CL_DEVICE_LOCAL_MEM_SIZE)); +} +unsigned long Device::getSharedMemoryPerComputeUnitSizeBytes() { + return *(this->queryInfoDevice(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE)); +} +unsigned int Device::getClockRateMHz() { + return *(this->queryInfoDevice(CL_DEVICE_MAX_CLOCK_FREQUENCY)); +} +bool Device::isIntegratedMainMemory() { + // CL_DEVICE_HOST_UNIFIED_MEMORY is deprecated in OpenCL 1.2 + // should probably use CL_DEVICE_SVM_CAPABILITIES instead in OpenCL 2.0 + return *(this->queryInfoDevice(CL_DEVICE_HOST_UNIFIED_MEMORY)); +} +MemoryObject* Device::malloc(long size, void* hostPtr, bool readOnly, bool writeOnly) { + return new MemoryObject(this, size, hostPtr, readOnly, writeOnly); +} +MemoryObject* Device::malloc(long size, const void* hostPtr) { + return new MemoryObject(this, size, hostPtr); +} +ChunkedMemoryObject* Device::mallocChunked(unsigned int chunks, long chunkSize, void** hostPointers, bool readOnly, bool writeOnly) { + return new ChunkedMemoryObject(this, chunks, chunkSize, hostPointers, readOnly, writeOnly); +} +ChunkedMemoryObject* Device::mallocChunked(unsigned int chunks, long chunkSize, const void** hostPointers) { + return new ChunkedMemoryObject(this, chunks, chunkSize, hostPointers); +} +Kernel* Device::prepareKernel(const std::string kernel_source, const std::string kernel_name) { + return new Kernel(this, kernel_source, kernel_name); +} +std::vector Device::prepareKernels(const std::string kernelSource, const std::vector kernelNames) { + cl_program oclProgram = this->compileOCLProgram(kernelSource); + + std::vector kernels; + for (auto name : kernelNames) { + kernels.push_back(new Kernel(this, oclProgram, name)); + } + return kernels; +} +template +const T* Device::queryInfoDevice(cl_device_info paramName, bool cacheable) { + //https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html + if (cacheable) { // Check if the attribute is cached + std::lock_guard lock(this->attributeCacheMutex); // Auto-unlock, RAII + auto it = this->attributeCache.find(paramName); + if (it != this->attributeCache.end()) { + return (T*)it->second; + } + } + + size_t valueSize; + clGetDeviceInfo(this->getBaseDeviceObject(), paramName, 0, NULL, &valueSize); + T* value = new T[valueSize]; + clGetDeviceInfo(this->getBaseDeviceObject(), paramName, valueSize, value, NULL); + if (cacheable) { // Stores the attribute in cache + std::lock_guard lock(this->attributeCacheMutex); // Auto-unlock, RAII + 
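+ // The heap buffer stored here is what the value-returning wrappers above (getComputeUnitsCount(),
+ // getGlobalMemorySizeBytes(), etc.) dereference; each cl_device_info key is queried and allocated
+ // only once, and later calls reuse the cached pointer.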
this->attributeCache[paramName] = value; + } + return value; +} +cl_program Device::compileOCLProgram(std::string source) { +#ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[GSPar Device " << this << "] Kernel received to compile: \n" << source << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + std::string openclExtensions = "#pragma OPENCL EXTENSION all: enable\n"; + std::string completeKernelSource = ""; + completeKernelSource.append(openclExtensions); + completeKernelSource.append(Instance::getInstance()->getKernelGenerator()->generateStdFunctions()); + completeKernelSource.append(Instance::getInstance()->getKernelGenerator()->replaceMacroKeywords(source)); + +#ifdef GSPAR_DEBUG + ss << "[GSPar Device " << this << "] Complete kernel for compilation: \n" << completeKernelSource << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + cl_program oclProgram; + cl_device_id devId = this->getBaseDeviceObject(); + + // Place for inserting any additional macros + std::string macrosGspar = ""; + macrosGspar.append("-D GSPAR_DEVICE_KERNEL=" + KernelGenerator::KERNEL_PREFIX); + macrosGspar.append(" -D GSPAR_DEVICE_GLOBAL_MEMORY=" + KernelGenerator::GLOBAL_MEMORY_PREFIX); + macrosGspar.append(" -D GSPAR_DEVICE_SHARED_MEMORY=" + KernelGenerator::SHARED_MEMORY_PREFIX); + macrosGspar.append(" -D GSPAR_DEVICE_CONSTANT=" + KernelGenerator::CONSTANT_PREFIX); + macrosGspar.append(" -D GSPAR_DEVICE_FUNCTION=" + KernelGenerator::DEVICE_FUNCTION_PREFIX); + const char *compilationOptions = macrosGspar.c_str(); + +#ifdef GSPAR_DEBUG + ss << "[GSPar Device " << this << "] Compiling kernel with arguments: " << compilationOptions; + ss << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + cl_int status; + const char* src = completeKernelSource.c_str(); + oclProgram = clCreateProgramWithSource(this->getContext(), 1, &src, NULL, &status); + Exception::throwIfFailed(status, oclProgram, devId); + + status = clBuildProgram(oclProgram, 1, &devId, compilationOptions, NULL, NULL); + Exception::throwIfFailed(status, oclProgram, devId); + + return oclProgram; +} + +///// Kernel ///// + +void Kernel::loadOclKernel(const std::string kernelName) { + cl_int status; + this->oclKernel = clCreateKernel(this->oclProgram, kernelName.c_str(), &status); + Exception::throwIfFailed(status, this->oclProgram, this->device->getBaseDeviceObject()); + this->kernelName = kernelName; +} + +Kernel::Kernel() : BaseKernel() { } +Kernel::Kernel(Device* device, const std::string kernelSource, const std::string kernelName) : BaseKernel(device, kernelSource, kernelName) { + this->oclProgram = device->compileOCLProgram(kernelSource); + + this->isPrecompiled = false; //Kernel owns oclProgram + + this->loadOclKernel(kernelName); +} +Kernel::Kernel(Device* device, cl_program oclProgram, const std::string kernelName) : BaseKernel(device) { + this->oclProgram = oclProgram; + this->isPrecompiled = true; //Kernel shares oclProgram + + this->loadOclKernel(kernelName); +} +Kernel::~Kernel() { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + #endif + if (!this->isPrecompiled && this->oclProgram) { + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Kernel] Releasing oclProgram " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseProgram(this->oclProgram) ); // We don't throw exceptions on destructors + if (ex != nullptr) { + 
std::cerr << "Failed when releasing OpenCL program on Kernel destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->oclProgram = NULL; + } + if (this->oclKernel) { + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Kernel " << this << "] Releasing oclKernel" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseKernel(this->oclKernel) ); // We don't throw exceptions on destructors + if (ex != nullptr) { + std::cerr << "Failed when releasing OpenCL kernel on Kernel destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->oclKernel = NULL; + } +} +void Kernel::cloneInto(BaseKernelBase* baseOther) { + BaseKernel::cloneInto(baseOther); + Kernel* other = static_cast(baseOther); + other->oclProgram = this->oclProgram; + // cl_kernel objects are not thread-safe (OpenCL 1.2 Specification p. 360) + other->loadOclKernel(this->kernelName); + // We do not mark this kernel as precompiled, so it destroys the cl_program on destructor. + // However, once it is destroyed, the cloned pattern cannot be further cloned because we need the program to call clCreateKernel (called during the clone process) + // TODO I haven't tested, but this probably causes issues + // this->isPrecompiled = true; + other->isPrecompiled = true; +} +int Kernel::setParameter(MemoryObject* memoryObject) { + cl_mem oclObject = memoryObject->getBaseMemoryObject(); + throwExceptionIfFailed( clSetKernelArg(this->oclKernel, this->parameterCount++, sizeof(cl_mem), &oclObject) ); + return this->parameterCount; +} +int Kernel::setParameter(ChunkedMemoryObject* chunkedMemoryObject) { + cl_mem oclObject = chunkedMemoryObject->getBaseMemoryObject(); + throwExceptionIfFailed( clSetKernelArg(this->oclKernel, this->parameterCount++, sizeof(cl_mem), &oclObject) ); + return this->parameterCount; +} +int Kernel::setParameter(size_t parm_size, void* parm) { + // clSetKernelArg expects a const void*, so we can treat all pointers as const + return this->setParameter(parm_size, const_cast(parm)); +} +int Kernel::setParameter(size_t parm_size, const void* parm) { + // The argument data pointed to by arg_value is copied and the arg_value pointer can therefore be reused by the application after clSetKernelArg returns. + // https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clSetKernelArg.html + throwExceptionIfFailed( clSetKernelArg(this->oclKernel, this->parameterCount++, parm_size, parm) ); + return this->parameterCount; +} +GSPar::Driver::Dimensions Kernel::getNumBlocksAndThreadsFor(Dimensions dims) { + + // CL_DEVICE_MAX_WORK_GROUP_SIZE is usually 1024, but CL_KERNEL_WORK_GROUP_SIZE is 256. + // In general, the kernels works just fine with 1024 even with the 256 limitation reported by CL_KERNEL_WORK_GROUP_SIZE. + // What limit should we use? + // unsigned int maxThreadsPerBlock = this->device->getMaxThreadsPerBlock(); //CL_DEVICE_MAX_WORK_GROUP_SIZE + const size_t *kernelWorkGroupSize = this->queryInfo(CL_KERNEL_WORK_GROUP_SIZE); + unsigned int maxThreadsPerBlock = *kernelWorkGroupSize; + + // Should we check CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS? We only support 3 dimensions anyway. 
+ const size_t* maxWorkItemSizes = this->device->queryInfoDevice(CL_DEVICE_MAX_WORK_ITEM_SIZES); + size_t maxThreadsDimension[3]; // Copying data to remove constness + memcpy(maxThreadsDimension, maxWorkItemSizes, sizeof(size_t) * 3); + + return this->getNumBlocksAndThreads(dims, maxThreadsPerBlock, maxThreadsDimension); +} +void Kernel::runAsync(Dimensions dims, ExecutionFlow* executionFlow) { + + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Kernel " << this << "] Running kernel async with " << this->parameterCount << " parameters for " << dims.toString() << " in flow " << executionFlow << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + cl_command_queue oclQueue = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + + if (!dims.x) { + throw Exception("The first dimension is required to run a kernel"); + } + + Dimensions blocksAndThreads = this->getNumBlocksAndThreadsFor(dims); + + int dimensions = dims.getCount(); + + size_t localSize[dimensions]; + size_t globalSize[dimensions]; + for (int d = 0; d < dimensions; d++) { + localSize[d] = blocksAndThreads[d].max; + globalSize[d] = blocksAndThreads[d].min * localSize[d]; + } + + // Set shared memory - https://community.khronos.org/t/dynamically-allocated-shared-memory/1562 + if (this->sharedMemoryBytes > 0) { + throwExceptionIfFailed( clSetKernelArg(this->oclKernel, this->parameterCount++, this->sharedMemoryBytes, NULL) ); + } + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Kernel " << this << "] Shall start " << dims.toString() << " threads: "; + ss << "starting (" << globalSize[0]; + if (dims.y) ss << "," << globalSize[1]; + if (dims.z) ss << "," << globalSize[2]; + ss << ") threads "; + ss << "divided in blocks of (" << localSize[0]; + if (dims.y) ss << "," << localSize[1]; + if (dims.z) ss << "," << localSize[2]; + ss << ") threads "; + ss << "using " << this->sharedMemoryBytes << " bytes of shared memory in execution flow " << executionFlow << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + cl_event *evt = new cl_event; + throwExceptionIfFailed( clEnqueueNDRangeKernel(oclQueue, this->oclKernel, dimensions, NULL, globalSize, localSize, 0, NULL, evt) ); + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Kernel " << this << "] Setting evt to wait: " << evt << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + this->setBaseAsyncObject(evt); // setBaseAsyncObject sets runningAsync to false + // Use Execution Flow instead of the event for synchronization. See comment on executionFlow attribute. + this->setExecutionFlowToSynchronize(executionFlow ? 
executionFlow : this->device->getDefaultExecutionFlow()); + this->runningAsync = true; +} +template +T* Kernel::queryInfo(cl_kernel_work_group_info param, bool cacheable) { + // https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetKernelWorkGroupInfo.html + if (cacheable) { // Check if the attribute is cached + auto it = this->attributeCache.find(param); + if (it != this->attributeCache.end()) { + return (T*)it->second; + } + } + + size_t valueSize; + throwExceptionIfFailed( clGetKernelWorkGroupInfo(this->oclKernel, this->device->getBaseDeviceObject(), param, 0, NULL, &valueSize) ); + T* value = new T[valueSize]; + throwExceptionIfFailed( clGetKernelWorkGroupInfo(this->oclKernel, this->device->getBaseDeviceObject(), param, valueSize, value, NULL) ); + if (cacheable) { // Stores the attribute in cache + this->attributeCache[param] = value; + } + return value; +} + + +///// MemoryObject ///// + +void MemoryObject::copy(bool in, bool async, ExecutionFlow* executionFlow) { + cl_event *evt = new cl_event; + cl_bool blocking = async ? CL_FALSE : CL_TRUE; + int numEvtsToWait = 0; + cl_event *evtToWait = NULL; + if (this->getBaseAsyncObject()) { + numEvtsToWait = this->numAsyncEvents; + evtToWait = this->asyncObject; + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar MemObj " << this << "] Already has an async event: " << evtToWait << ", binding two events" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + } + + cl_command_queue oclQueue = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + + if (in) { + throwExceptionIfFailed( clEnqueueWriteBuffer( + oclQueue, this->devicePtr, + blocking, 0, this->size, this->hostPtr, + numEvtsToWait, evtToWait, evt) ); + } else { //copy out + throwExceptionIfFailed( clEnqueueReadBuffer( + oclQueue, this->devicePtr, + blocking, 0, this->size, this->hostPtr, + numEvtsToWait, evtToWait, evt) ); + } + if (this->getBaseAsyncObject()) { // Releases old async event handler + this->releaseBaseAsyncObject(); + } + if (async) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar MemObj " << this << "] Setting evt " << evt << " from queue " << oclQueue << " to wait" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + this->setBaseAsyncObject(evt); // setBaseAsyncObject sets runningAsync to false + // Use Execution Flow instead of the event for synchronization. See comment on executionFlow attribute. + this->setExecutionFlowToSynchronize(executionFlow ? 
executionFlow : this->device->getDefaultExecutionFlow()); + this->runningAsync = true; + } +} + +void MemoryObject::allocDeviceMemory() { + cl_int status; + + // Security check is already done in base class + cl_mem_flags ocl_flags = CL_MEM_READ_WRITE; + if (this->isReadOnly()) { + ocl_flags = CL_MEM_READ_ONLY; + } else if (this->isWriteOnly()) { + ocl_flags = CL_MEM_WRITE_ONLY; + } + + this->devicePtr = clCreateBuffer(device->getContext(), ocl_flags, size, NULL, &status); + throwExceptionIfFailed(status); +} +MemoryObject::MemoryObject(Device* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly) : BaseMemoryObject(device, size, hostPtr, readOnly, writeOnly) { + this->allocDeviceMemory(); +} +MemoryObject::MemoryObject(Device* device, size_t size, const void* hostPtr) : BaseMemoryObject(device, size, hostPtr) { + this->allocDeviceMemory(); +} +MemoryObject::~MemoryObject() { + if (this->devicePtr) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar MemObj] Releasing Memory Object " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseMemObject(this->devicePtr) ); // We don't throw exceptions on destructors + if (ex != nullptr) { + std::cerr << "Failed when releasing OpenCL memory object: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->devicePtr = NULL; + } +} +void MemoryObject::copyIn() { copy(true, false); } +void MemoryObject::copyOut() { copy(false, false); } +void MemoryObject::copyInAsync(ExecutionFlow* executionFlow) { copy(true, true, executionFlow); } +void MemoryObject::copyOutAsync(ExecutionFlow* executionFlow) { copy(false, true, executionFlow); } + + +///// ChunkedMemoryObject ///// + +void ChunkedMemoryObject::copy(bool in, bool async, unsigned int chunkFrom, unsigned int chunkTo, ExecutionFlow* executionFlow) { + unsigned int numChunksToCopy = chunkTo - chunkFrom; + cl_event *newEvents = new cl_event[numChunksToCopy]; + + cl_bool blocking = async ? 
CL_FALSE : CL_TRUE; + unsigned int currentNumEvents = 0; + cl_event *currentEvents = NULL; + if (this->getBaseAsyncObject()) { + currentNumEvents = this->numAsyncEvents; + currentEvents = this->asyncObject; + } + + cl_command_queue oclQueue = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + + for (unsigned int chunk = chunkFrom, evtIdx = 0; chunk < chunkTo; chunk++, evtIdx++) { + if (in) { + throwExceptionIfFailed( clEnqueueWriteBuffer( + oclQueue, this->devicePtr, + blocking, chunk * this->getChunkSize(), this->getChunkSize(), this->hostPointers[chunk], + currentNumEvents, currentEvents, &newEvents[evtIdx]) ); + } else { //copy out + throwExceptionIfFailed( clEnqueueReadBuffer( + oclQueue, this->devicePtr, + blocking, chunk * this->getChunkSize(), this->getChunkSize(), this->hostPointers[chunk], + currentNumEvents, currentEvents, &newEvents[evtIdx]) ); + } + } + if (this->getBaseAsyncObject()) { // Releases old async event handler + this->releaseBaseAsyncObject(); + } + if (async) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar ChunkedMemObj " << this << "] Setting evts (" << numChunksToCopy << ") to wait: " << newEvents << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + this->setBaseAsyncObject(newEvents, numChunksToCopy); // setBaseAsyncObject sets runningAsync to false + // Use Execution Flow instead of the event for synchronization. See comment on executionFlow attribute. + this->setExecutionFlowToSynchronize(executionFlow ? executionFlow : this->device->getDefaultExecutionFlow()); + this->runningAsync = true; + } +} + +void ChunkedMemoryObject::allocDeviceMemory() { + cl_int status; + + // Security check is already done in base class + cl_mem_flags ocl_flags = CL_MEM_READ_WRITE; + if (this->isReadOnly()) { + ocl_flags = CL_MEM_READ_ONLY; + } else if (this->isWriteOnly()) { + ocl_flags = CL_MEM_WRITE_ONLY; + } + + // We allocate space for all the memory chunks + this->devicePtr = clCreateBuffer(device->getContext(), ocl_flags, this->getChunkSize() * this->chunks, NULL, &status); + throwExceptionIfFailed(status); +} +ChunkedMemoryObject::ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly) : + BaseChunkedMemoryObject(device, chunks, chunkSize, hostPointers, readOnly, writeOnly) { + this->allocDeviceMemory(); +} +ChunkedMemoryObject::ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, const void** hostPointers) : + BaseChunkedMemoryObject(device, chunks, chunkSize, hostPointers) { + this->allocDeviceMemory(); +} +ChunkedMemoryObject::~ChunkedMemoryObject() { + //devicePtr is released in ~MemoryObject +} +void ChunkedMemoryObject::copyIn() { copy(true, false, 0, this->chunks); } +void ChunkedMemoryObject::copyOut() { copy(false, false, 0, this->chunks); } +void ChunkedMemoryObject::copyInAsync(ExecutionFlow* executionFlow) { copy(true, true, 0, this->chunks, executionFlow); } +void ChunkedMemoryObject::copyOutAsync(ExecutionFlow* executionFlow) { copy(false, true, 0, this->chunks, executionFlow); } +void ChunkedMemoryObject::copyIn(unsigned int chunk) { copy(true, false, chunk, chunk+1); } +void ChunkedMemoryObject::copyOut(unsigned int chunk) { copy(false, false, chunk, chunk+1); } +void ChunkedMemoryObject::copyInAsync(unsigned int chunk, ExecutionFlow* executionFlow) { copy(true, true, chunk, chunk+1, executionFlow); } +void 
ChunkedMemoryObject::copyOutAsync(unsigned int chunk, ExecutionFlow* executionFlow) { copy(false, true, chunk, chunk+1, executionFlow); } + + +///// StreamElement ///// + +StreamElement::StreamElement(Device* device) : BaseStreamElement(device) { + // Can't call this virtual function in the base constructor + this->start(); +} + +StreamElement::~StreamElement() { } + + +///// KernelGenerator ///// + +const std::string KernelGenerator::KERNEL_PREFIX = "__kernel"; +const std::string KernelGenerator::GLOBAL_MEMORY_PREFIX = "__global"; +const std::string KernelGenerator::SHARED_MEMORY_PREFIX = "__local"; +const std::string KernelGenerator::CONSTANT_PREFIX = "__constant"; +const std::string KernelGenerator::DEVICE_FUNCTION_PREFIX = ""; + +const std::string KernelGenerator::getKernelPrefix() { + return KernelGenerator::KERNEL_PREFIX + " void"; +} +std::string KernelGenerator::generateStdFunctions() { + return "" + "size_t gspar_get_global_id(unsigned int dimension) { return get_global_id(dimension); } \n" + "size_t gspar_get_thread_id(unsigned int dimension) { return get_local_id(dimension); } \n" + "size_t gspar_get_block_id(unsigned int dimension) { return get_group_id(dimension); } \n" + "size_t gspar_get_block_size(unsigned int dimension) { return get_local_size(dimension); } \n" + "size_t gspar_get_grid_size(unsigned int dimension) { return get_num_groups(dimension); } \n" + "void gspar_synchronize_local_threads() { barrier(CLK_LOCAL_MEM_FENCE); } \n" + "int gspar_atomic_add_int(__global int *valq, int delta){ atomic_add(valq, delta); } \n" + "double gspar_atomic_add_double(__global double *valq, double delta){ \n " + " union { double f; unsigned long i; } old; \n" + " union { double f; unsigned long i; } new1; \n" + " do { \n" + " old.f = *valq; \n" + " new1.f = old.f + delta; \n" + " } while (atom_cmpxchg((volatile __global unsigned long *)valq, old.i, new1.i) != old.i); \n" + " return old.f; \n" + "} \n" + ; +} +std::string KernelGenerator::replaceMacroKeywords(std::string kernelSource) { + kernelSource = std::regex_replace(kernelSource, std::regex("GSPAR_DEVICE_MACRO_BEGIN"), "#define"); + kernelSource = std::regex_replace(kernelSource, std::regex("GSPAR_DEVICE_MACRO_END"), "\n"); + return kernelSource; +} +std::string KernelGenerator::generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions max) { + return ""; +} +std::string KernelGenerator::generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::string r = ""; + for(int d = 0; d < dims.getCount(); d++) { + if (dims.is(d)) { + std::string varName = this->getStdVarNameForDimension(pattern->getStdVarNames(), d); + r += "const unsigned long gspar_max_" + varName + ","; + if (dims[d].min && !pattern->isBatched()) { + // TODO Support min in batches + r += "const unsigned long gspar_min_" + varName + ","; + } + } + } + if (pattern->isBatched()) { + // This names are used in other methods + r += "unsigned int gspar_batch_size,"; + } + for(auto ¶m : pattern->getParameterList()) { + if (param->direction != Pattern::ParameterDirection::GSPAR_PARAM_NONE) { + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER || param->isBatched()) { // Batched values are always pointers + r += KernelGenerator::GLOBAL_MEMORY_PREFIX + " "; + } + if (param->direction == Pattern::ParameterDirection::GSPAR_PARAM_IN && param->isConstant()) { + r += "const "; + } + r += param->toKernelParameter() + ","; + } + } + if (pattern->isUsingSharedMemory()) { + auto shmem = pattern->getSharedMemoryParameter(); + r 
+= KernelGenerator::SHARED_MEMORY_PREFIX + " " + shmem->toString(); + } else { + if (!r.empty()) r.pop_back(); // removes last comma + } + return r; +} +std::string KernelGenerator::generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::array patternNames = pattern->getStdVarNames(); + + // OpenCL get_global_id returns a size_t, so this is the type of our std variables + // https://www.khronos.org/registry/OpenCL/specs/opencl-1.2.pdf#page=244 + std::string r; + for(int d = 0; d < dims.getCount(); d++) { + if (dims.is(d)) { + std::string varName = this->getStdVarNameForDimension(patternNames, d); + if (pattern->isBatched()) { + r += "size_t gspar_global_" + varName; + } else { + r += "size_t " + varName; + } + r += " = gspar_get_global_id(" + std::to_string(d) + ")"; + if (dims[d].min && !pattern->isBatched()) { + // TODO Support min in batches + r += " + gspar_min_" + varName; + } + r += "; \n"; + // TODO Support multi-dimensional batches + if (pattern->isBatched()) { + // Intended implicit floor(gspar_global/dims) + r += "size_t gspar_batch_" + varName + " = ((size_t)(gspar_global_" + varName + " / " + std::to_string(dims[d].max) + ")); \n"; + r += "size_t gspar_offset_" + varName + " = gspar_batch_" + varName + " * " + std::to_string(dims[d].max) + "; \n"; + // This variable names are used in other methods, keep track + r += "size_t " + varName + " = gspar_global_" + varName + " - gspar_offset_" + varName + "; \n"; + } + } + } + return r; +} +std::string KernelGenerator::generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::array patternNames = pattern->getStdVarNames(); + // TODO Support multi-dimensional batches + std::string stdVarFirstDimension = this->getStdVarNameForDimension(patternNames, 0); + + std::string r = ""; + for(auto ¶m : pattern->getParameterList()) { + if (param->isBatched()) { + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + r += "__global "; + } + r += param->type.getFullName() + " " + param->name + " = "; + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + r += "&" + param->getKernelParameterName() + "[gspar_offset_" + stdVarFirstDimension + "]"; + } else if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { + r += param->getKernelParameterName() + "[gspar_batch_" + stdVarFirstDimension + "]"; + } + r += ";\n"; + } + } + return r; +} diff --git a/src/GSPar_OpenCL.hpp b/src/GSPar_OpenCL.hpp new file mode 100644 index 0000000..a0327c4 --- /dev/null +++ b/src/GSPar_OpenCL.hpp @@ -0,0 +1,260 @@ + +#ifndef __GSPAR_OPENCL_INCLUDED__ +#define __GSPAR_OPENCL_INCLUDED__ + +#include +#include +#include +#include + +///// Forward declarations ///// + +namespace GSPar { + namespace Driver { + namespace OpenCL { + class Exception; + class ExecutionFlow; + class AsyncExecutionSupport; + class Instance; + class Device; + class Kernel; + class MemoryObject; + class ChunkedMemoryObject; + class StreamElement; + class KernelGenerator; + } + } +} + +#include "GSPar_BaseGPUDriver.hpp" + +namespace GSPar { + namespace Driver { + namespace OpenCL { + + ///// Exception ///// + + class Exception : + public BaseException { + protected: + std::string getErrorString(cl_int code) override; + + public: + explicit Exception(std::string msg, std::string details = ""); + explicit Exception(cl_int code, std::string details = ""); + + static Exception* checkError(cl_int code, std::string details = ""); + static void 
throwIfFailed(cl_int code, std::string details = ""); + + explicit Exception(cl_int code, cl_program program, cl_device_id device); + static Exception* checkError(cl_int code, cl_program program, cl_device_id device); + static void throwIfFailed(cl_int code, cl_program program, cl_device_id device); + }; + + ///// ExecutionFlow ///// + + class ExecutionFlow : + virtual public BaseExecutionFlow { + public: + ExecutionFlow(); + explicit ExecutionFlow(Device* device); + virtual ~ExecutionFlow(); + cl_command_queue start() override; + void synchronize() override; + + static cl_command_queue checkAndStartFlow(Device* device, ExecutionFlow* executionFlow = NULL); + }; + + ///// AsyncExecutionSupport ///// + + class AsyncExecutionSupport : + virtual public BaseAsyncExecutionSupport { + protected: + unsigned int numAsyncEvents = 0; + /// OpenCL sometimes simply hangs on clWaitForEvents + /// I've seen it happen when using multithread and 3 kernels (pattern->run) called sequentially by each thread + /// The internet are full of people complaining over similar issues, and one of them used clFinish instead of clWaitForEvents, so that's what we're gonna do + /// https://github.com/fangq/mcxcl/commit/135dc825e2905253ab0626a2b335dfee8b6e741e + /// https://community.intel.com/t5/OpenCL/Is-there-a-driver-watchdog-time-limit-for-Intel-GPU-on-Linux/td-p/1108291 + /// Whenever an Execution Flow is filled here, we'll synchronize it instead of waiting for the event + ExecutionFlow *executionFlow = nullptr; + public: + AsyncExecutionSupport(cl_event *asyncObjs = NULL, unsigned int numAsyncEvents = 0); + virtual ~AsyncExecutionSupport(); + void setBaseAsyncObject(cl_event *asyncObject) override; + void waitAsync() override; + + void releaseBaseAsyncObject(); + void setBaseAsyncObject(cl_event *asyncObject, unsigned int numAsyncEvents); + void setExecutionFlowToSynchronize(ExecutionFlow *flow) { + this->executionFlow = flow; + } + static void waitAllAsync(std::initializer_list asyncs); + }; + + ///// Instance ///// + + class Instance : public BaseInstance { + protected: + static Instance *instance; + void loadGpuList() override; + + public: + Instance(); + virtual ~Instance(); + void init() override; + unsigned int getGpuCount() override; + + static Instance* getInstance(); + }; + + ///// Device ///// + + class Device : + public BaseDevice { + private: + mutable std::mutex attributeCacheMutex; + std::map attributeCache; + + public: + Device(); + explicit Device(cl_device_id deviceId); + virtual ~Device(); + ExecutionFlow* getDefaultExecutionFlow() override; + cl_context getContext() override; + cl_command_queue startDefaultExecutionFlow() override; + const std::string getName() override; + unsigned int getComputeUnitsCount() override; + unsigned int getWarpSize() override; + unsigned int getMaxThreadsPerBlock() override; + unsigned long getGlobalMemorySizeBytes() override; + unsigned long getLocalMemorySizeBytes() override; + unsigned long getSharedMemoryPerComputeUnitSizeBytes() override; + unsigned int getClockRateMHz() override; + bool isIntegratedMainMemory() override; + MemoryObject* malloc(long size, void* hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) override; + MemoryObject* malloc(long size, const void* hostPtr = nullptr) override; + ChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, void** hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) override; + ChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, const void** hostPtr = 
nullptr) override; + Kernel* prepareKernel(const std::string kernelSource, const std::string kernelName) override; + std::vector prepareKernels(const std::string kernelSource, const std::vector kernelNames) override; + + template + const T* queryInfoDevice(cl_device_info paramName, bool cacheable = true); + cl_program compileOCLProgram(std::string source); + }; + + ///// Kernel ///// + + class Kernel : + public BaseKernel, + public AsyncExecutionSupport { + private: + cl_program oclProgram; + cl_kernel oclKernel; + bool isPrecompiled; + std::map attributeCache; + + void loadOclKernel(const std::string kernelName); + + public: + Kernel(); + Kernel(Device* device, const std::string kernelSource, const std::string kernelName); + virtual ~Kernel(); + virtual void cloneInto(BaseKernelBase* baseOther) override; + int setParameter(MemoryObject* memoryObject) override; + int setParameter(ChunkedMemoryObject* chunkedMemoryObject) override; + int setParameter(size_t parmSize, void* parm) override; + int setParameter(size_t parmSize, const void* parm) override; + Dimensions getNumBlocksAndThreadsFor(Dimensions dims) override; + void runAsync(Dimensions max, ExecutionFlow* executionFlow = NULL) override; + + template + T* queryInfo(cl_kernel_work_group_info param, bool cacheable = true); + Kernel(Device* device, cl_program oclProgram, const std::string kernelName); + }; + + ///// MemoryObject ///// + + class MemoryObject : + public BaseMemoryObject, + public AsyncExecutionSupport { + private: + void copy(bool in, bool async, ExecutionFlow* executionFlow = NULL); + void allocDeviceMemory(); + + public: + MemoryObject(Device* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly); + MemoryObject(Device* device, size_t size, const void* hostPtr); + virtual ~MemoryObject(); + void copyIn() override; + void copyOut() override; + void copyInAsync(ExecutionFlow* executionFlow = NULL) override; + void copyOutAsync(ExecutionFlow* executionFlow = NULL) override; + }; + + ///// ChunkedMemoryObject ///// + + class ChunkedMemoryObject : + public BaseChunkedMemoryObject, + public AsyncExecutionSupport { + private: + void copy(bool in, bool async, unsigned int chunkFrom, unsigned int chunkTo, ExecutionFlow* executionFlow = NULL); + void allocDeviceMemory(); + + public: + ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly); + ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, const void** hostPointers); + virtual ~ChunkedMemoryObject(); + // Copy all chunks + virtual void copyIn() override; + virtual void copyOut() override; + virtual void copyInAsync(ExecutionFlow* executionFlow = NULL) override; + virtual void copyOutAsync(ExecutionFlow* executionFlow = NULL) override; + // Copy specific chunks of memory. We can't use function overloading due to the override. 
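+ // Usage sketch (hypothetical host code, variable names assumed): a single-chunk copy enqueues one
+ // clEnqueueWriteBuffer/clEnqueueReadBuffer at byte offset chunk * getChunkSize(), e.g.:
+ //   ChunkedMemoryObject* obj = device->mallocChunked(8, chunkSize, hostPointers);
+ //   obj->copyInAsync(3, flow); // uploads only chunk 3 on the given execution flow
+ //   obj->waitAsync();          // synchronizes through the flow (see AsyncExecutionSupport)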
+ virtual void copyIn(unsigned int chunk); + virtual void copyOut(unsigned int chunk); + virtual void copyInAsync(unsigned int chunk, ExecutionFlow* executionFlow = NULL); + virtual void copyOutAsync(unsigned int chunk, ExecutionFlow* executionFlow = NULL); + }; + + ///// StreamElement ///// + + class StreamElement : + public BaseStreamElement, + public AsyncExecutionSupport, + public ExecutionFlow { + private: + Kernel* kernel; + cl_kernel oclKernel = NULL; + + public: + explicit StreamElement(Device* device); + ~StreamElement(); + }; + + ///// KernelGenerator ///// + + class KernelGenerator : + public BaseKernelGenerator { + public: + static const std::string KERNEL_PREFIX; + static const std::string GLOBAL_MEMORY_PREFIX; + static const std::string SHARED_MEMORY_PREFIX; + static const std::string CONSTANT_PREFIX; + static const std::string DEVICE_FUNCTION_PREFIX; + const std::string getKernelPrefix() override; + std::string generateStdFunctions() override; + std::string replaceMacroKeywords(std::string kernelSource) override; + std::string generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + + }; + + } + } +} + +#endif diff --git a/src/GSPar_PatternComposition.hpp b/src/GSPar_PatternComposition.hpp new file mode 100644 index 0000000..6952e46 --- /dev/null +++ b/src/GSPar_PatternComposition.hpp @@ -0,0 +1,271 @@ + +#ifndef __GSPAR_PATTERNCOMPOSITION_INCLUDED__ +#define __GSPAR_PATTERNCOMPOSITION_INCLUDED__ + +#include +#include +#include +#include + +///// Forward declarations ///// + +namespace GSPar { + namespace Pattern { + class PatternComposition; + } +} + +#include "GSPar_Base.hpp" +#include "GSPar_BaseGPUDriver.hpp" +#include "GSPar_BaseParallelPattern.hpp" +#include "GSPar_PatternMap.hpp" +#include "GSPar_PatternReduce.hpp" + +namespace GSPar { + namespace Pattern { + + enum PatternType { + GSPAR_PATTERN_MAP, + GSPAR_PATTERN_REDUCE + }; + + class PatternComposition { + protected: + bool built = false; + std::string extraKernelCode; + std::array stdVarNames; + std::vector patterns; + std::map patternsTypes; + Driver::Dimensions compiledPatternsDimension; + + template + inline bool instanceof(const T*) { + return std::is_base_of::value; + } + + template + std::string generateKernelSource(Driver::Dimensions max, unsigned int gpuIndex = 0) { + + std::string kernelSource = this->extraKernelCode; + if (!this->extraKernelCode.empty()) { + kernelSource += "\n"; + } + bool addedKernel = false; + for(auto pattern : patterns) { + if (pattern->getGpuIndex() != gpuIndex) { + continue; + } + addedKernel = true; + + pattern->callbackBeforeGeneratingKernelSource(); + kernelSource += pattern->generateKernelSource(max); + kernelSource += "\n"; + } + + return addedKernel ? kernelSource : ""; + } + + template + PatternComposition& addPatternInverseOrder(T* pattern) { + this->assertValidParallelPattern(pattern); + //This has a terrible performance, but this vector shouldn't be that large for this to be a problem + patterns.insert(patterns.begin(), 1, pattern); + this->patternsTypes[pattern] = this->instanceof(pattern) ? 
GSPAR_PATTERN_MAP : GSPAR_PATTERN_REDUCE; + return *this; + } + + void assertAnyPatternAdded() { + if (this->patterns.empty()) { + throw GSParException("No patterns added in composition, interrupting"); + } + } + template + void assertValidParallelPattern(T* pattern) { + if (!this->instanceof(pattern)) { + throw GSParException("Trying to add invalid pattern. All patterns must inherit BaseParallelPattern."); + } + } + + template + void run(Driver::Dimensions pDims, bool useCompiledDim) { + this->assertAnyPatternAdded(); + Driver::Dimensions dims = useCompiledDim ? this->compiledPatternsDimension : pDims; + if (!dims.getCount()) { + throw GSParException("No dimensions set to run the pattern composition"); + } + + // TODO validade if dims is valid + + this->compilePatterns(dims); + + for (const auto& pattern : this->patterns) { + // We pass dims again in Run case we have other thread asking the pattern to compile to another dims (which shouldn't happen anyway) + switch (this->patternsTypes[pattern]) { + case GSPAR_PATTERN_MAP: + (static_cast(pattern))->run(dims); + break; + case GSPAR_PATTERN_REDUCE: + // Almost https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern + (static_cast(pattern))->run(dims); + break; + } + } + } + + public: + PatternComposition() = default; + + template + PatternComposition(std::initializer_list patterns) { + for (auto p : patterns) { + this->addPattern(p); + } + } + + template + PatternComposition(TFirst pattern, TArgs... args) : PatternComposition(args...) { + this->addPatternInverseOrder(pattern); // The elements are processed from last to first + } + + virtual ~PatternComposition() { } + + + template + PatternComposition* clone() const { + PatternComposition* other = new PatternComposition(); + for (const auto &pattern : this->patterns) { + switch (this->patternsTypes.at(pattern)) { + case GSPAR_PATTERN_MAP: + other->addPattern((static_cast(pattern))->clone()); + break; + case GSPAR_PATTERN_REDUCE: + other->addPattern((static_cast(pattern))->clone()); + break; + } + } + other->built = this->built; + other->extraKernelCode = this->extraKernelCode; + other->stdVarNames = this->stdVarNames; + if (this->compiledPatternsDimension.getCount()) { + Driver::Dimensions compiledPatternsDimension = this->compiledPatternsDimension; + other->compiledPatternsDimension = compiledPatternsDimension; + } + return other; + } + + virtual PatternComposition& addExtraKernelCode(std::string extraKernelCode) { + this->extraKernelCode += extraKernelCode; + return *this; + } + + virtual BaseParallelPattern* getPattern(size_t index) { + return patterns[index]; + } + + template + PatternComposition& addPattern(T* pattern) { + this->assertValidParallelPattern(pattern); + patterns.push_back(pattern); + this->patternsTypes[pattern] = this->instanceof(pattern) ? 
GSPAR_PATTERN_MAP : GSPAR_PATTERN_REDUCE; + return *this; + } + + virtual bool isAllPatternsCompiledFor(Driver::Dimensions dims) { + if (this->compiledPatternsDimension != dims) { // We are compiled with a different dims + return false; + } + for (auto pattern : this->patterns) { + if (!pattern->isKernelCompiledFor(dims)) { + return false; + } + } + return true; + } + + template + PatternComposition& compilePatterns(Driver::Dimensions dims) { + this->assertAnyPatternAdded(); + if (this->isAllPatternsCompiledFor(dims)) { + // The kernels are already compiled + return *this; + } + + // Init GPU driver + TDriverInstance* driver = TDriverInstance::getInstance(); + // Driver::OpenCL::Instance driver = TDriverInstance::getInstance(); //Provides autocomplete + driver->init(); + + if (driver->getGpuCount() == 0) { + throw GSParException("No GPU found, interrupting"); + } + + auto gpus = driver->getGpuList(); + + unsigned int gpuIndex = 0; + for (const auto& gpu : gpus) { + // Prepare kernels + std::string kernelSource = this->generateKernelSource(dims, gpuIndex); + if (kernelSource.empty()) { + continue; // If there's no patterns in this GPU, we can move on + } + + std::vector kernelNames; + for (auto pattern : this->patterns) { + if (pattern->getGpuIndex() != gpuIndex) { + continue; + } + kernelNames.push_back(pattern->getKernelName()); + } + + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[GSPar "<prepareKernels(kernelSource.c_str(), kernelNames); + int patternIndex = 0; + for (auto pattern : this->patterns) { + if (pattern->getGpuIndex() != gpuIndex) { + continue; + } + pattern->setCompiledKernel(kernels.at(patternIndex), dims); + patternIndex++; + } + } + gpuIndex++; + } + + this->compiledPatternsDimension = dims; + + return *this; + } + + template + void run() { + this->run(Driver::Dimensions(), true); + } + + template + void run(unsigned long dims[3][2]) { + this->run(Driver::Dimensions(dims), false); + } + + template + void run(unsigned long max[3]) { + this->run(Driver::Dimensions(max), false); + } + + template + void run(Driver::Dimensions dims) { + this->run(dims, false); + } + }; + + } +} + +#endif diff --git a/src/GSPar_PatternMap.hpp b/src/GSPar_PatternMap.hpp new file mode 100644 index 0000000..b2a1dc6 --- /dev/null +++ b/src/GSPar_PatternMap.hpp @@ -0,0 +1,29 @@ + +#ifndef __GSPAR_PATTERNMAP_INCLUDED__ +#define __GSPAR_PATTERNMAP_INCLUDED__ + +#include "GSPar_BaseParallelPattern.hpp" + +namespace GSPar { + namespace Pattern { + + /** + * Map parallel pattern + */ + class Map : public BaseParallelPattern { + public: + Map() : BaseParallelPattern() { }; + Map(std::string source) : BaseParallelPattern(source) { }; + + template + Map* clone() const { + Map* other = new Map(); + this->cloneInto(other); + return other; + } + }; + + } +} + +#endif diff --git a/src/GSPar_PatternReduce.cpp b/src/GSPar_PatternReduce.cpp new file mode 100644 index 0000000..ab21161 --- /dev/null +++ b/src/GSPar_PatternReduce.cpp @@ -0,0 +1,155 @@ +#include + +#include "GSPar_PatternReduce.hpp" + +using namespace GSPar::Pattern; + +PointerParameter* Reduce::getOutputParameter() { + auto param = this->getParameter(this->outputParameterName); + if (!param) { + throw GSParException("Could not find output parameter with name '" + this->outputParameterName + "' in Reduce pattern"); + } + return static_cast(param); +} + +PointerParameter* Reduce::generateSharedMemoryParameter(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { + if (dims.y || dims.z) { + // TODO support multiple dimensions + throw 
GSParException("Reduce pattern currently does not support multi-dimensional kernels"); + } + + // if (this->sharedMemoryParameter == nullptr || !this->sharedMemoryParameter->isComplete()) { + this->getSharedMemoryParameter(); // Generate the placeholder parameter + + std::lock_guard lock(this->sharedMemoryParameterMutex); // Auto-unlock, RAII + if (!this->sharedMemoryParameter->isComplete()) { // Check if there was a race condition for this resource + Driver::Dimensions blocksAndThreads = kernel->getNumBlocksAndThreadsFor(dims); + size_t sharedMemSize = (dims.x.max > blocksAndThreads.x.max) ? blocksAndThreads.x.max : dims.x.max; + + auto outParam = this->getOutputParameter(); + this->sharedMemoryParameter->numberOfElements = sharedMemSize; + this->sharedMemoryParameter->size = outParam->size * sharedMemSize; + this->sharedMemoryParameter->setComplete(true); + } + // Auto-unlock of sharedMemoryParameterMutex, RAII + // } + return this->sharedMemoryParameter; +} + +PointerParameter* Reduce::getSharedMemoryParameter() { + if (this->sharedMemoryParameter == nullptr) { + std::lock_guard lock(this->sharedMemoryParameterMutex); // Auto-unlock, RAII + if (this->sharedMemoryParameter == nullptr) { // Check if there was a race condition for this resource + auto outParam = this->getOutputParameter(); + std::string paramName = "gspar_shared_" + getRandomString(5); + this->sharedMemoryParameter = new PointerParameter(paramName, outParam->type, 0, nullptr); + } + // Auto-unlock of sharedMemoryParameterMutex, RAII + } + return this->sharedMemoryParameter; +}; + +std::string Reduce::getKernelCore(Driver::Dimensions dims, std::array stdVarNames) { + if (dims.y || dims.z) { + // TODO support multiple dimensions + throw GSParException("Reduce pattern currently does not support multi-dimensional kernels"); + } + + PointerParameter *outParam = this->getOutputParameter(); + auto shmemParam = this->getSharedMemoryParameter(); + std::string shmem = shmemParam->name; + + std::string op = this->binaryOperation; + std::string gid = stdVarNames[0]; + std::string max = "gspar_max_" + stdVarNames[0]; + std::string tid = "gspar_tid_" + stdVarNames[0]; + std::string bid = "gspar_bid_" + stdVarNames[0]; + std::string bsize = "gspar_bsize_" + stdVarNames[0]; + + // TODO support batches and min-max in Reduce + + // https://devblogs.nvidia.com/using-shared-memory-cuda-cc/ + // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf + std::string kernelSource = + " size_t " + tid + " = gspar_get_thread_id(0); \n" + " size_t " + bid + " = gspar_get_block_id(0); \n" + " size_t " + bsize + " = gspar_get_block_size(0); \n" + " " + shmem + "["+tid+"] = " + this->vectorName + "["+gid+"]; \n" + " gspar_synchronize_local_threads(); \n" + + " for (unsigned int s="+bsize+"/2; s>0; s>>=1) { \n" + " if ("+tid+" < s && "+gid+"+s < "+max+") { \n" + " "+shmem+"["+tid+"] = "+shmem+"["+tid+"]" + op + shmem+"["+tid+"+s]; \n" + " } \n" + " gspar_synchronize_local_threads(); \n" + " if ("+tid+" == 0 && s > 1 && s % 2 != 0) { \n" + " "+shmem+"["+tid+"] = "+shmem+"["+tid+"]" + op + shmem+"[s-1]; \n" + " } \n" + " gspar_synchronize_local_threads(); \n" + " } \n" + " if ("+tid+" == 0) { \n" + " if ("+bsize+" % 2 != 0) { \n" + " "+shmem+"[0] = "+shmem+"[0]" + op + shmem+"["+max+"-1]; \n" + " } \n" + " " + this->partialTotalsParamName + "["+bid+"] = "+shmem+"[0]; \n" + // If the param is input, we reduce it together in the end + + (outParam->isIn() ? 
+ " if (gspar_get_grid_size(0) == 1) { \n" + " " + this->partialTotalsParamName+"["+bid+"] = " + this->partialTotalsParamName+"["+bid+"]" + op + "*" + outParam->name + "; \n" + " } \n" + : "") + + " } \n" + ; + + return kernelSource; +}; + +bool Reduce::isKernelCompiledFor(Driver::Dimensions dims) { + // We only compile if the kernel wasn't compiled yet and the configuration didn't change + return this->_isKernelCompiled && !this->isKernelStale && this->compiledKernelDimension.getCount() == dims.getCount(); +} + +void Reduce::callbackBeforeGeneratingKernelSource() { + auto partialTotalsParam = this->getParameter(this->partialTotalsParamName); + if (!partialTotalsParam) { + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[GSPar Reduce "<partialTotalsParamName << ")" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + // It is a ParameterPlaceholder, but we don't have the type here to call the proper function + auto outParam = this->getOutputParameter(); + VarType partialsTotalsType = outParam->type; + if (!partialsTotalsType.isPointer) { + partialsTotalsType.name += "*"; + partialsTotalsType.isPointer = true; + } + this->setPointerParameter(this->partialTotalsParamName, partialsTotalsType, 0, nullptr, GSPAR_PARAM_OUT); + } +} + +void Reduce::callbackBeforeAllocatingMemoryOnGpu(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { + auto partialTotalsParam = this->getParameter(this->partialTotalsParamName); + if (!partialTotalsParam || !partialTotalsParam->isComplete()) { + // TODO we could use the previous value (~15 lines above) + Driver::Dimensions blocksAndThreads = kernel->getNumBlocksAndThreadsFor(dims); + auto outParam = this->getOutputParameter(); + + size_t partialTotalsSize = blocksAndThreads.x.min * outParam->size; // Number of blocks * data size + // Should we store this pointer in a class-wide attribute? 
+ void *partialTotals = malloc(partialTotalsSize); + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[GSPar Reduce "<partialTotalsParamName << ") as " << partialTotals << " (pointer of " << partialTotalsSize << " bytes)" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + VarType partialsTotalsType = outParam->type; + if (!partialsTotalsType.isPointer) { + partialsTotalsType.name += "*"; + partialsTotalsType.isPointer = true; + } + this->setPointerParameter(this->partialTotalsParamName, partialsTotalsType, partialTotalsSize, partialTotals, GSPAR_PARAM_OUT); + } +} diff --git a/src/GSPar_PatternReduce.hpp b/src/GSPar_PatternReduce.hpp new file mode 100644 index 0000000..cc9f074 --- /dev/null +++ b/src/GSPar_PatternReduce.hpp @@ -0,0 +1,174 @@ + +#ifndef __GSPAR_PATTERNREDUCE_INCLUDED__ +#define __GSPAR_PATTERNREDUCE_INCLUDED__ + +#include "GSPar_BaseParallelPattern.hpp" + +namespace GSPar { + namespace Pattern { + + /** + * Reduce parallel pattern + */ + class Reduce : public BaseParallelPattern { + private: + const std::string partialTotalsParamName = "gspar_partial_reductions"; + PointerParameter* getOutputParameter(); + + protected: + std::string vectorName; + std::string binaryOperation; // https://northstar-www.dartmouth.edu/doc/ibmcxx/en_US/doc/language/ref/ruclxbin.htm + std::string outputParameterName; + + PointerParameter* generateSharedMemoryParameter(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) override; + PointerParameter* getSharedMemoryParameter() override; + + public: + Reduce() : BaseParallelPattern() { }; + Reduce(std::string vectorName, std::string binaryOperation, std::string outputParameterName) : BaseParallelPattern("") { + this->vectorName = vectorName; + this->binaryOperation = binaryOperation; + this->outputParameterName = outputParameterName; + this->useSharedMemory = true; + }; + + template + Reduce* clone() const { + Reduce* other = new Reduce(); + this->cloneInto(other); + other->vectorName = this->vectorName; + other->binaryOperation = this->binaryOperation; + other->outputParameterName = this->outputParameterName; + return other; + }; + + std::string getKernelCore(Driver::Dimensions dims, std::array stdVarNames) override; + + bool isKernelCompiledFor(Driver::Dimensions dims) override; + + // Callback override + void callbackBeforeGeneratingKernelSource() override; + void callbackBeforeAllocatingMemoryOnGpu(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) override; + + // Main run function for Reduce Pattern + // TODO this does not override base class due to templates. Fix this. 
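+ // Usage sketch (hypothetical host code; registering the "v" and "result" parameters uses the
+ // BaseParallelPattern API declared elsewhere):
+ //   Reduce* sum = new Reduce("v", "+", "result"); // reduce vector "v" with '+' into "result"
+ //   unsigned long max[3] = { n, 0, 0 };
+ //   sum->run<GSPar::Driver::OpenCL::Instance>(GSPar::Driver::Dimensions(max));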
+ template + void run(Driver::Dimensions dimsToUse) { + if (dimsToUse.y || dimsToUse.z) { + // TODO support multiple dimensions + throw GSParException("Reduce pattern currently does not support multi-dimensional kernels"); + } + + // TODO support batched Reduce pattern + + #ifdef GSPAR_DEBUG + std::stringstream ss; + #endif + this->compile(dimsToUse); + + // #ifdef GSPAR_DEBUG + // auto gpu = this->getGpu(); + // ss << "[GSPar Reduce "<getName() << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + auto kernel = this->getCompiledKernel(); + kernel->clearParameters(); + + this->callbackBeforeAllocatingMemoryOnGpu(dimsToUse, kernel); + + this->mallocParametersInGpu(); + + this->copyParametersFromHostToGpuAsync(); + + auto executionFlow = this->getExecutionFlow(); + + Driver::Dimensions dimsToRun = dimsToUse; + + // We start reducing the input vector + PointerParameter *inputVector = static_cast(this->getParameter(this->vectorName)); + if (inputVector == nullptr) { + throw GSParException("Could not find input parameter with name '" + this->vectorName + "' in Reduce pattern"); + } + decltype(TDriverInstance::getMemoryObjectType())* inputMemoryObject = dynamic_cast(inputVector->getMemoryObject()); + + // In the first iteration, partialTotals is the output. After the first iteration, it is the input and output parameters + PointerParameter *partialTotals = static_cast(this->getParameter(this->partialTotalsParamName)); + if (partialTotals == nullptr) { + throw GSParException("Could not find partial totals parameter with name '" + this->partialTotalsParamName + "' in Reduce pattern"); + } + + while (true) { + + Driver::Dimensions blocksAndThreads = kernel->getNumBlocksAndThreadsFor(dimsToRun); + + this->setSharedMemoryInKernel(kernel, dimsToRun); + + // Init this->setParametersInKernel + this->setDimsParametersInKernel(kernel, dimsToRun); + + // Sets Pattern parameters in Kernel object + for (auto& paramName : this->paramsOrder) { + if (paramName == this->vectorName) { // Input parameter + if (inputMemoryObject) { + inputMemoryObject->waitAsync(); // Waits for async copy to finish + } + kernel->setParameter(inputMemoryObject); // We can simply set the memory object + } else { + auto param = this->getParameter(paramName); + this->setParameterInKernel(kernel, param); + } + + } + // Finish this->setParametersInKernel + + this->callbackAfterCopyDataFromHostToGpu(); + this->callbackBeforeRunInGpu(); + + #ifdef GSPAR_DEBUG + ss << "[GSPar Reduce "<runAsync(dimsToRun, executionFlow); + + kernel->waitAsync(); + + #ifdef GSPAR_DEBUG + ss << "[GSPar Reduce "<(partialTotals->getMemoryObject()); + + kernel->clearParameters(); + } + + // "Hack" to copy partial totals into output parameter + PointerParameter *outParam = this->getOutputParameter(); + decltype(TDriverInstance::getMemoryObjectType())* outputMemoryObject = dynamic_cast(partialTotals->getMemoryObject()); + outputMemoryObject->bindTo(outParam->getPointer(), outParam->size); + outputMemoryObject->copyOut(); + outParam->direction = GSPAR_PARAM_NONE; // We already copied the parameter out, copyParametersFromGpuToHostAsync should ignore it + + this->callbackAfterRunInGpu(); + + this->copyParametersFromGpuToHostAsync(); + + this->callbackAfterCopyDataFromGpuToHost(dimsToUse, kernel); + } + }; + + } +} + +#endif diff --git a/thirdpt/marX2/marX2.c b/thirdpt/marX2/marX2.c new file mode 100644 index 0000000..4bc12ed --- /dev/null +++ b/thirdpt/marX2/marX2.c @@ -0,0 +1,434 @@ +/* 
*************************************************************************** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU General Public License. This exception does not however + * invalidate any other reasons why the executable file might be covered by + * the GNU General Public License. + * + **************************************************************************** + */ + +/* + Author: Marco Aldinucci. + email: aldinuc@di.unipi.it + marco@pisa.quadrics.com + date : 15/11/97 + + Modified: Massimo Coppola + email: coppola@di.unipi.it + date: 23/11/2001 + +*/ + +/* gcc -Wall -ansi -c -O2 -I/usr/X11R6/include marX2.c for Linux LAN */ +/* cc -fast -Xc -c -I/usr/openwin/include marX2.c for SUN or MEIKO */ +/* Specify the path where X11 include are ... */ + +/* ar -rv marX2.a marX2.o */ +/* ranlib marX2.a */ + + +#include "marX2.h" + +#ifndef PI +#define PI 3.1415926535 +#endif + +/*#define DEBUG(x) x;fflush(stdout);*/ +/* defined empty to remove any debug printf */ +#define DEBUG(x) + +void ShowTM (); +void ChooseColorMap(int); +void RainbowColorMap(int); + +static Display * display; +static Window window; + +#define SCREEN DefaultScreen(display) + +static GC gc; +static int fg; +static int bg; +static XImage * image; +static Colormap colormap; + + +static char *m_image_buffer=NULL; /* pointer to the image buffer area */ + +static int m_h,m_w; /* size of our window */ + +static int iScreen; /* integer id of the default screen of display */ +static int dDepth; /* display bit depth: only 8,12,24 bits are supported */ +static unsigned int sizeofPixel; /* rounded up pixel size */ +static int rounded_length; /* rounded up length of a line in pixels */ + + +static int primo=0; /* ?? 0 if the colormap has not yet been changed ?? */ + +/* function that opens a window over the default screen */ +void SetupXWindows(int w, int h, int setup_color, char *display_name, + const char *window_title) +{ + XEvent event; + XGCValues gcvalues; + int mask; + + display = XOpenDisplay((display_name ? 
display_name : "")); + if (!display) + { + printf("Error in open X display\n"); + exit (1); + } + + iScreen = DefaultScreen(display); + dDepth = DefaultDepth(display,iScreen); + switch (dDepth) + { + case 8: + sizeofPixel=1; + break; + case 16: + sizeofPixel=2; + break; + case 24: + case 32: /* It should work; on my system 24 bits pixels are word-aligned */ + sizeofPixel=4; + break; + default: + printf("SetupXWindows : unsupported Display depth %d\n",dDepth); + } + + DEBUG(printf("Default display depth %d\n", dDepth)); + + m_w=w; + m_h=h; + + /* scanlines are rounded to 32 pixels to avoid any misalignment */ + rounded_length = (m_w+31)&(~31); + /* allocate window buffer space */ + m_image_buffer = /*(XImage *)*/ + (char *) calloc(sizeof(char)*sizeofPixel,rounded_length*m_h); + + fg=Black(); + bg=White(); + + window = XCreateSimpleWindow(display, DefaultRootWindow(display), + 0,0,w,h,2,fg,bg); + + mask = ExposureMask | ButtonPressMask | KeyPressMask; + XSelectInput(display, window,mask); + XStoreName(display, window, window_title); + XMapWindow(display, window); + + for (;;) + { + XNextEvent(display, &event); + if (event.type == Expose) + break; + } + + gcvalues.foreground = fg; + gcvalues.background = bg; + mask = GCForeground | GCBackground; + gc = XCreateGC(display, window, mask, &gcvalues); + + DEBUG(printf("Window opened\n")); +} + + +void CloseXWindows() +{ + /* should close the window and free all resources */ + + XCloseDisplay(display); + +} + +/* service function to convert an array of bytes to an array of dDepth + depth pixels. +*/ +static void ConvertLine(unsigned char *line, int line_len, XImage * dest) +{ + int i, value, mult=0; + switch (dDepth) + { + case 32: + mult = 0x01010101; + break; + case 24: + mult = 0x000f0704; + break; + case 16: + mult = 0x00000101; + break; + case 8: + mult = 1; + break; + } + for (i=0;irounded_length) line_len=rounded_length; + if (position>m_h) return; + + /* alloc temp image area and XImage structure */ + DEBUG(printf("Showline - 2 pt %x len %d pos %d rlen %d \n", + line, line_len, position, rounded_length)); + image_buffer = (char *)(calloc(sizeof(char)*sizeofPixel, rounded_length)); + + DEBUG(printf("Showline - 3 tmpimage %x siz %x len %x \n", + image_buffer, sizeof(char)*sizeofPixel, rounded_length)); + + image_line = XCreateImage(display,DefaultVisual(display, SCREEN), + dDepth,ZPixmap,0, + (char *)image_buffer,rounded_length,1,32,0); + + DEBUG(printf("Showline - 4.1 image_line %x \n",image_line)); + + /* convert the input data into the image */ + ConvertLine(line, line_len, image_line); + + /* Put the line into the window */ + XPutImage(display,window,gc,image_line,0,0,0,position,line_len,1); + XFlush(display); + + /* Save the line into our backing store */ + memcpy((m_image_buffer+(position*rounded_length*sizeofPixel)), + image_buffer, + line_len*sizeofPixel); + /* destroy temporary image */ + XDestroyImage (image_line); + DEBUG(printf("Showline - 9\n")); +} + + +/* manage some events coming from the window: + * refresh (Expose) events, + * button press, + * key presses: q,Q,c,C,r,R,m,M + */ +void HXI(int *px, int *py,int *dim,int *done) +{ + XEvent event; + int something =0; + static int next=0; + const int clicks = 10; /* how many clicks before exiting anyway */ + + *dim=1; + *done=0; + + while ((XEventsQueued(display, QueuedAfterReading) > 0)|| somethingx; + *py=bpe->y; + button=bpe->button; + *dim = (button == Button1 ? 2 : + button == Button2 ? 
4 : 8); + something++; + } + break; + case Expose: + if (m_image_buffer!=NULL) + { + if (image == NULL) + { + image = + XCreateImage(display, + DefaultVisual(display,SCREEN), + dDepth,ZPixmap,0, + (char*) m_image_buffer, + rounded_length,m_h,32,0); + DEBUG(printf("image %x \n",image)); + } + XPutImage(display,window,gc,image,0,0,0,0,m_w,m_h); + XFlush(display); + } + break; + case KeyPress: + { + XKeyEvent * kpe = (XKeyEvent *) &event; + KeySym ks = XLookupKeysym(kpe, 0); + + switch (ks) + { + case 'q': + case 'Q': + *done=1; + something=clicks; + break; + case 'r': + case 'R': + RainbowColorMap(128); + break; + case 'c': + case 'C': + ChooseColorMap(next++); + break; + case 'M': + case 'm': + ShowTM (); + break; + default: + printf("Keys:\n\n"); + printf( + "q) Quit !\n" + "c) change colormap (8bits display)\n" + "c + r) rainbow colormap (8bits display)\n"); + fflush(stdout); + + } + } + break; + } + } +} + + +/* two functions to get the default fg/gb colours on the default screen*/ +int Black(void){return(BlackPixelOfScreen(DefaultScreenOfDisplay(display)));} +int White(void){return(WhitePixelOfScreen(DefaultScreenOfDisplay(display)));} + + +/* show a short message on the window */ +void ShowTM () +{ + /* should clear the window */ + + XGCValues gcvalues,tmp_val; + int mask; + char s1[]="Bacci Cantalupo Ravazzolo"; + char s2[]="Riaudo Pesciullesi"; + char s3[]="Aldinucci Coppola Torquati"; + + mask=GCForeground|GCFunction; + XGetGCValues(display,gc,mask,&tmp_val); + gcvalues.foreground=1; + gcvalues.function=GXcopy; + XChangeGC(display,gc,mask,&gcvalues); + XDrawString(display,window,gc,5,10,"QSW PISA are:",12); + XDrawString(display,window,gc,10,25,s1,strlen(s1)); + XDrawString(display,window,gc,10,40,s2,strlen(s2)); + XDrawString(display,window,gc,10,55,s3,strlen(s3)); + /* Restore Graphic Context */ + XChangeGC(display,gc,mask,&tmp_val); + +} + +/* the following two functions change the colormap for 8-bit + displays. They should check and do nothing on true color displays */ +void ChooseColorMap(int which) +{ + + Visual visual; + XColor color; + int i; + + if (dDepth!=8)return; /* only for 256 color display! */ + + if (primo==0) + { + visual = *DefaultVisual(display, SCREEN); + colormap= XCreateColormap(display, window, &visual, AllocAll); + } + + color.flags=DoRed | DoGreen | DoBlue; + for (i=0; i< 256; i++) + { + color.pixel=i; + color.red=0xffffL * ((long) (i+which) * 101 %256)/255L; + color.green=0xffffL * ((long) (i+which) * 151 %256)/255L; + color.blue=0xffffL * ((long) (i+which) * 171 %256)/255L; + + XStoreColor(display, colormap, &color); + } + + if (primo==0) + { + XInstallColormap(display,colormap); + XSetWindowColormap(display,window,colormap); + primo=1; + } +} + +void RainbowColorMap(int n) +{ + int i, j; + double d, e; + XColor color; + + if (dDepth!=8)return; /* only for 256 color display! */ + + if (primo!=0) + { + color.flags=DoRed | DoGreen | DoBlue; + for (i = 1; i < n - 1; i++) { + j = n - 1 - i; + d = (d = cos((double)((j - n * 0.16) * (PI / n)))) < 0.0 + ? 0.0 : d; + color.blue = d * n; + d = (d = cos((double)((j - n * 0.52) * (PI / n)))) < 0.0 + ? 0.0 : d; + color.green = d * n; + d = (d = cos((double)((j - n * .83) * (PI / n)))) < 0.0 + ? 0.0 : d; + e = (e = cos((double)(j * (PI / n)))) < 0.0 + ? 
0.0 : e; + color.red = d * n + e * (n / 2); + color.pixel=i; + XStoreColor(display, colormap, &color); + } + color.green=color.blue=color.red=color.pixel=i; + XStoreColor(display, colormap, &color); + color.green=color.blue=color.red=color.pixel=0; + XStoreColor(display, colormap, &color); + } +} diff --git a/thirdpt/marX2/marX2.h b/thirdpt/marX2/marX2.h new file mode 100644 index 0000000..990e815 --- /dev/null +++ b/thirdpt/marX2/marX2.h @@ -0,0 +1,29 @@ + +#ifndef MARCO_X +#define MARCO_X + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif +void CloseXWindows(); +void ShowLine(void *line,int line_len,int position); +void SetupXWindows(int w, int h, int setup_color, char *display_name, + const char *window_title); + +void HXI(int *px, int *py,int *dim,int *done); + +int Black(); +int White(); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdpt/rapidxml-1.13/license.txt b/thirdpt/rapidxml-1.13/license.txt new file mode 100644 index 0000000..1409831 --- /dev/null +++ b/thirdpt/rapidxml-1.13/license.txt @@ -0,0 +1,52 @@ +Use of this software is granted under one of the following two licenses, +to be chosen freely by the user. + +1. Boost Software License - Version 1.0 - August 17th, 2003 +=============================================================================== + +Copyright (c) 2006, 2007 Marcin Kalicinski + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +2. The MIT License +=============================================================================== + +Copyright (c) 2006, 2007 Marcin Kalicinski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. diff --git a/thirdpt/rapidxml-1.13/manual.html b/thirdpt/rapidxml-1.13/manual.html new file mode 100644 index 0000000..2c42270 --- /dev/null +++ b/thirdpt/rapidxml-1.13/manual.html @@ -0,0 +1,406 @@ +

RAPIDXML Manual

Version 1.13

Copyright (C) 2006, 2009 Marcin Kalicinski
See accompanying file license.txt for license information.

Table of Contents

1. What is RapidXml?
1.1 Dependencies And Compatibility
1.2 Character Types And Encodings
1.3 Error Handling
1.4 Memory Allocation
1.5 W3C Compliance
1.6 API Design
1.7 Reliability
1.8 Acknowledgements
2. Two Minute Tutorial
2.1 Parsing
2.2 Accessing The DOM Tree
2.3 Modifying The DOM Tree
2.4 Printing XML
3. Differences From Regular XML Parsers
3.1 Lifetime Of Source Text
3.2 Ownership Of Strings
3.3 Destructive Vs Non-Destructive Mode
4. Performance
4.1 Comparison With Other Parsers
5. Reference

1. What is RapidXml?

RapidXml is an attempt to create the fastest XML DOM parser possible, while retaining usability, portability and reasonable W3C compatibility. It is an in-situ parser written in C++, with parsing speed approaching that of the strlen() function executed on the same data.

The entire parser is contained in a single header file, so no building or linking is necessary. To use it, you just need to copy the rapidxml.hpp file to a convenient place (such as your project directory) and include it where needed. You may also want to use the printing functions contained in the header rapidxml_print.hpp.

1.1 Dependencies And Compatibility

RapidXml has no dependencies other than a very small subset of the standard C++ library (<cassert>, <cstdlib>, <new> and <exception>, unless exceptions are disabled). It should compile on any reasonably conformant compiler, and was tested on Visual C++ 2003, Visual C++ 2005, Visual C++ 2008, gcc 3, gcc 4, and Comeau 4.3.3. Care was taken that no warnings are produced on these compilers, even with the highest warning levels enabled.

1.2 Character Types And Encodings

RapidXml is character type agnostic, and can work both with narrow and wide characters. The current version does not fully support UTF-16 or UTF-32, so use of wide characters is somewhat limited. However, it should successfully parse wchar_t strings containing UTF-16 or UTF-32 if the endianness of the data matches that of the machine. UTF-8 is fully supported, including all numeric character references, which are expanded into appropriate UTF-8 byte sequences (unless you enable the parse_no_utf8 flag).

Note that RapidXml performs no decoding - strings returned by the name() and value() functions will contain text encoded using the same encoding as the source file. RapidXml understands and expands the following character references: &apos; &amp; &quot; &lt; &gt; &#...; Other character references are not expanded.

1.3 Error Handling

By default, RapidXml uses C++ exceptions to report errors. If this behaviour is undesirable, RAPIDXML_NO_EXCEPTIONS can be defined to suppress exception code. See parse_error class and parse_error_handler() function for more information.
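A minimal sketch of catching and reporting a parse error, assuming the XML lives in a writable, zero-terminated buffer xml_text (the surrounding function name is illustrative only):

#include <iostream>
#include "rapidxml.hpp"

void parse_or_report(char *xml_text)
{
    rapidxml::xml_document<> doc;
    try
    {
        doc.parse<0>(xml_text);                  // may throw rapidxml::parse_error
    }
    catch (rapidxml::parse_error &e)
    {
        // what() returns a human-readable message; where<Ch>() points into the source text
        std::cerr << "Parse error: " << e.what()
                  << " near: " << e.where<char>() << "\n";
    }
}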

1.4 Memory Allocation

RapidXml uses a special memory pool object to allocate nodes and attributes, because direct allocation using new operator would be far too slow. Underlying memory allocations performed by the pool can be customized by use of memory_pool::set_allocator() function. See class memory_pool for more information.

1.5 W3C Compliance

RapidXml is not a W3C compliant parser, primarily because it ignores DOCTYPE declarations. There are a number of other, minor incompatibilities as well. Still, it can successfully parse and produce complete trees of all valid XML files in the W3C conformance suite (over 1000 files specially designed to find flaws in XML processors). In destructive mode it performs whitespace normalization and character entity substitution for a small set of built-in entities.

1.6 API Design

The RapidXml API is minimalistic, to reduce code size as much as possible and to facilitate use in embedded environments. Additional convenience functions are provided in separate headers: rapidxml_utils.hpp and rapidxml_print.hpp. The contents of these headers are not an essential part of the library and are currently documented only with comments in the code.

1.7 Reliability

RapidXml is very robust and comes with a large harness of unit tests. Special care has been taken to ensure stability of the parser no matter what source text is thrown at it. One of the unit tests produces 100,000 randomly corrupted variants of XML document, which (when uncorrupted) contains all constructs recognized by RapidXml. RapidXml passes this test when it correctly recognizes that errors have been introduced, and does not crash or loop indefinitely.

Another unit test puts RapidXml head-to-head with another, well-established XML parser, and verifies that their outputs match across a wide variety of small and large documents.

Yet another test feeds RapidXml with over 1000 test files from the W3C compliance suite, and verifies that correct results are obtained. There are also additional tests that verify each API function separately and test that various parsing modes work as expected.

1.8 Acknowledgements

I would like to thank Arseny Kapoulkine for his work on pugixml, which was an inspiration for this project. Additional thanks go to Kristen Wegner for creating pugxml, from which pugixml was derived. Janusz Wohlfeil kindly ran RapidXml speed tests on hardware that I did not have access to, allowing me to expand performance comparison table.

2. Two Minute Tutorial

2.1 Parsing

The following code causes RapidXml to parse a zero-terminated string named text:
using namespace rapidxml;
xml_document<> doc;    // character type defaults to char
doc.parse<0>(text);    // 0 means default parse flags

The doc object is now the root of a DOM tree representing the parsed XML. Because the entire RapidXml interface is contained inside namespace rapidxml, users must either bring the contents of this namespace into scope or fully qualify all names. Class xml_document represents the root of the DOM hierarchy; by means of public inheritance, it is also an xml_node and a memory_pool. The template parameter of the xml_document::parse() function specifies parsing flags, with which you can fine-tune the behaviour of the parser. Note that the flags must be a compile-time constant.
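For example, a minimal sketch of combining two of the flags listed in the Reference section (text is again assumed to be a writable, zero-terminated buffer):

using namespace rapidxml;
xml_document<> doc;
// Flags are combined with bitwise OR and must form a compile-time constant
doc.parse<parse_declaration_node | parse_comment_nodes>(text);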

2.2 Accessing The DOM Tree

To access the DOM tree, use methods of xml_node and xml_attribute classes:
cout << "Name of my first node is: " << doc.first_node()->name() << "\n";
xml_node<> *node = doc.first_node("foobar");
cout << "Node foobar has value " << node->value() << "\n";
for (xml_attribute<> *attr = node->first_attribute();
     attr; attr = attr->next_attribute())
{
    cout << "Node foobar has attribute " << attr->name() << " ";
    cout << "with value " << attr->value() << "\n";
}


2.3 Modifying The DOM Tree

The DOM tree produced by the parser is fully modifiable: nodes and attributes can be added or removed, and their contents changed. The example below creates an HTML document whose sole contents is a link to the google.com website:
xml_document<> doc;
xml_node<> *node = doc.allocate_node(node_element, "a", "Google");
doc.append_node(node);
xml_attribute<> *attr = doc.allocate_attribute("href", "google.com");
node->append_attribute(attr);

One quirk is that nodes and attributes do not own the text of their names and values. This is because normally they only store pointers to the source text. So, when assigning a new name or value to the node, care must be taken to ensure proper lifetime of the string. The easiest way to achieve it is to allocate the string from the xml_document memory pool. In the above example this is not necessary, because we are only assigning character constants. But the code below uses memory_pool::allocate_string() function to allocate node name (which will have the same lifetime as the document), and assigns it to a new node:
xml_document<> doc;
char *node_name = doc.allocate_string(name);        // Allocate string and copy name into it
xml_node<> *node = doc.allocate_node(node_element, node_name);  // Set node name to node_name

Check Reference section for description of the entire interface.

2.4 Printing XML

You can print xml_document and xml_node objects into an XML string. Use print() function or operator <<, which are defined in rapidxml_print.hpp header.
using namespace rapidxml;
xml_document<> doc;    // character type defaults to char
// ... some code to fill the document

// Print to stream using operator <<
std::cout << doc;

// Print to stream using print function, specifying printing flags
print(std::cout, doc, 0);   // 0 means default printing flags

// Print to string using output iterator
std::string s;
print(std::back_inserter(s), doc, 0);

// Print to memory buffer using output iterator
char buffer[4096];                      // You are responsible for making the buffer large enough!
char *end = print(buffer, doc, 0);      // end contains pointer to character after last printed character
*end = 0;                               // Add string terminator after XML


3. Differences From Regular XML Parsers

RapidXml is an in-situ parser, which allows it to achieve very high parsing speed. In-situ means that parser does not make copies of strings. Instead, it places pointers to the source text in the DOM hierarchy.

3.1 Lifetime Of Source Text

In-situ parsing requires that the source text lives at least as long as the document object. If the source text is destroyed, the names and values of nodes in the DOM tree become invalid as well. Additionally, whitespace processing, character entity translation, and zero-termination of strings require that the source text be modified during parsing (but see non-destructive mode). This makes the text useless for further processing once it has been parsed by RapidXml.

In many cases, however, these are not serious issues.

3.2 Ownership Of Strings

Nodes and attributes produced by RapidXml do not own their name and value strings. They merely hold the pointers to them. This means you have to be careful when setting these values manually, by using xml_base::name(const Ch *) or xml_base::value(const Ch *) functions. Care must be taken to ensure that lifetime of the string passed is at least as long as lifetime of the node/attribute. The easiest way to achieve it is to allocate the string from memory_pool owned by the document. Use memory_pool::allocate_string() function for this purpose.
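A minimal sketch, assuming doc already holds a parsed document and node points to one of its nodes:

// Copy the new value into memory owned by the document, then assign it;
// the copy lives as long as the document does
char *new_value = doc.allocate_string("some new value");
node->value(new_value);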

3.3 Destructive Vs Non-Destructive Mode

By default, the parser modifies source text during the parsing process. This is required to achieve character entity translation, whitespace normalization, and zero-termination of strings.

In some cases this behaviour may be undesirable, for example if the source text resides in read-only memory or is mapped to memory directly from a file. By using the appropriate parser flag (parse_non_destructive), source text modifications can be disabled. However, because RapidXml does in-situ parsing, this has the following side effects:
  • names and values of nodes and attributes are not zero-terminated; use name_size() and value_size() to determine their lengths
  • character entities are not translated
  • whitespace is not normalized
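A minimal sketch of non-destructive parsing, assuming text points to XML data that must not be modified:

using namespace rapidxml;
xml_document<> doc;
doc.parse<parse_non_destructive>(text);       // source text is left untouched

xml_node<> *root = doc.first_node();
// Names and values are not zero-terminated in this mode, so use the explicit sizes
std::size_t name_len = root->name_size();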

4. Performance

RapidXml achieves its speed through use of several techniques:
  • In-situ parsing. When building DOM tree, RapidXml does not make copies of string data, such as node names and values. Instead, it stores pointers to interior of the source text.
  • Use of template metaprogramming techniques. This allows it to move much of the work to compile time. Through magic of the templates, C++ compiler generates a separate copy of parsing code for any combination of parser flags you use. In each copy, all possible decisions are made at compile time and all unused code is omitted.
  • Extensive use of lookup tables for parsing.
  • Hand-tuned C++ with profiling done on several most popular CPUs.
This results in very small and fast code: a parser that is custom-tailored to the exact needs of each invocation.

4.1 Comparison With Other Parsers

The table below compares speed of RapidXml to some other parsers, and to strlen() function executed on the same data. On a modern CPU (as of 2007), you can expect parsing throughput to be close to 1 GB/s. As a rule of thumb, parsing speed is about 50-100x faster than Xerces DOM, 30-60x faster than TinyXml, 3-12x faster than pugxml, and about 5% - 30% faster than pugixml, the fastest XML parser I know of.
  • The test file is a real-world, 50kB large, moderately dense XML file.
  • All timing is done by using RDTSC instruction present in Pentium-compatible CPUs.
  • No profile-guided optimizations are used.
  • All parsers are running in their fastest modes.
  • The results are given in CPU cycles per character, so frequency of CPUs is irrelevant.
  • The results are minimum values from a large number of runs, to minimize effects of operating system activity, task switching, interrupt handling etc.
  • A single parse of the test file takes about 1/10th of a millisecond, so with large number of runs there is a good chance of hitting at least one no-interrupt streak, and obtaining undisturbed results.
Platform     Compiler    strlen()   RapidXml   pugixml 0.3   pugxml   TinyXml
Pentium 4    MSVC 8.0    2.5        5.4        7.0           61.7     298.8
Pentium 4    gcc 4.1.1   0.8        6.1        9.5           67.0     413.2
Core 2       MSVC 8.0    1.0        4.5        5.0           24.6     154.8
Core 2       gcc 4.1.1   0.6        4.6        5.4           28.3     229.3
Athlon XP    MSVC 8.0    3.1        7.7        8.0           25.5     182.6
Athlon XP    gcc 4.1.1   0.9        8.2        9.2           33.7     265.2
Pentium 3    MSVC 8.0    2.0        6.3        7.0           30.9     211.9
Pentium 3    gcc 4.1.1   1.0        6.7        8.9           35.3     316.0

(*) All results are in CPU cycles per character of source text

5. Reference

This section lists all classes, functions, constants etc. and describes them in detail.
class + template + rapidxml::memory_pool
+ constructor + memory_pool()
+ destructor + ~memory_pool()
function allocate_node(node_type type, const Ch *name=0, const Ch *value=0, std::size_t name_size=0, std::size_t value_size=0)
function allocate_attribute(const Ch *name=0, const Ch *value=0, std::size_t name_size=0, std::size_t value_size=0)
function allocate_string(const Ch *source=0, std::size_t size=0)
function clone_node(const xml_node< Ch > *source, xml_node< Ch > *result=0)
function clear()
function set_allocator(alloc_func *af, free_func *ff)

class rapidxml::parse_error
+ constructor + parse_error(const char *what, void *where)
function what() const
function where() const

class + template + rapidxml::xml_attribute
+ constructor + xml_attribute()
function document() const
function previous_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function next_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const

class + template + rapidxml::xml_base
+ constructor + xml_base()
function name() const
function name_size() const
function value() const
function value_size() const
function name(const Ch *name, std::size_t size)
function name(const Ch *name)
function value(const Ch *value, std::size_t size)
function value(const Ch *value)
function parent() const

class + template + rapidxml::xml_document
+ constructor + xml_document()
function parse(Ch *text)
function clear()

class + template + rapidxml::xml_node
+ constructor + xml_node(node_type type)
function type() const
function document() const
function first_node(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function last_node(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function previous_sibling(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function next_sibling(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function first_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function last_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function type(node_type type)
function prepend_node(xml_node< Ch > *child)
function append_node(xml_node< Ch > *child)
function insert_node(xml_node< Ch > *where, xml_node< Ch > *child)
function remove_first_node()
function remove_last_node()
function remove_node(xml_node< Ch > *where)
function remove_all_nodes()
function prepend_attribute(xml_attribute< Ch > *attribute)
function append_attribute(xml_attribute< Ch > *attribute)
function insert_attribute(xml_attribute< Ch > *where, xml_attribute< Ch > *attribute)
function remove_first_attribute()
function remove_last_attribute()
function remove_attribute(xml_attribute< Ch > *where)
function remove_all_attributes()

namespace rapidxml
enum node_type
function parse_error_handler(const char *what, void *where)
function print(OutIt out, const xml_node< Ch > &node, int flags=0)
function print(std::basic_ostream< Ch > &out, const xml_node< Ch > &node, int flags=0)
function operator<<(std::basic_ostream< Ch > &out, const xml_node< Ch > &node)
+ constant + parse_no_data_nodes
+ constant + parse_no_element_values
+ constant + parse_no_string_terminators
+ constant + parse_no_entity_translation
+ constant + parse_no_utf8
+ constant + parse_declaration_node
+ constant + parse_comment_nodes
+ constant + parse_doctype_node
+ constant + parse_pi_nodes
+ constant + parse_validate_closing_tags
+ constant + parse_trim_whitespace
+ constant + parse_normalize_whitespace
+ constant + parse_default
+ constant + parse_non_destructive
+ constant + parse_fastest
+ constant + parse_full
+ constant + print_no_indenting


class template rapidxml::memory_pool

Defined in rapidxml.hpp
Base class for xml_document

Description

This class is used by the parser to create new nodes and attributes, without the overhead of dynamic memory allocation. In most cases, you will not need to use this class directly. However, if you need to create nodes manually or modify names/values of nodes, you are encouraged to use the memory_pool of the relevant xml_document to allocate the memory. Not only is this faster than allocating them by using the new operator, but their lifetime will also be tied to the lifetime of the document, possibly simplifying memory management.

Call the allocate_node() or allocate_attribute() functions to obtain new nodes or attributes from the pool. You can also call the allocate_string() function to allocate strings. Such strings can then be used as names or values of nodes without worrying about their lifetime. Note that there is no free() function -- all allocations are freed at once when the clear() function is called, or when the pool is destroyed.

It is also possible to create a standalone memory_pool and use it to allocate nodes whose lifetime will not be tied to any document.
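For instance, a brief sketch of a standalone pool (the element name "item" is an arbitrary example):

rapidxml::memory_pool<> pool;                 // standalone pool, not tied to any document
rapidxml::xml_node<> *item =
    pool.allocate_node(rapidxml::node_element, "item", "some value");
// item, and everything else allocated from pool, is freed when the pool is
// destroyed or when pool.clear() is called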

The pool maintains RAPIDXML_STATIC_POOL_SIZE bytes of statically allocated memory. Until static memory is exhausted, no dynamic memory allocations are done. When static memory is exhausted, the pool allocates additional blocks of memory of size RAPIDXML_DYNAMIC_POOL_SIZE each, using the global new[] and delete[] operators. This behaviour can be changed by setting custom allocation routines; use the set_allocator() function to set them.

Allocations for nodes, attributes and strings are aligned at RAPIDXML_ALIGNMENT bytes. This value defaults to the size of a pointer on the target architecture.

To obtain top performance from the parser, it is important that all nodes are allocated from a single, contiguous block of memory; otherwise, cache misses when jumping between two (or more) disjoint blocks of memory can slow down parsing considerably. If required, you can tweak RAPIDXML_STATIC_POOL_SIZE, RAPIDXML_DYNAMIC_POOL_SIZE and RAPIDXML_ALIGNMENT to obtain the best trade-off between wasted memory and performance. To do so, define their values before the rapidxml.hpp file is included.
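A brief sketch of such tuning; the sizes shown are arbitrary examples, not recommended values:

// Must appear before rapidxml.hpp is included
#define RAPIDXML_STATIC_POOL_SIZE  (256 * 1024)    // 256 kB static pool
#define RAPIDXML_DYNAMIC_POOL_SIZE (256 * 1024)    // grow in 256 kB blocks
#include "rapidxml.hpp"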

Parameters

Ch
Character type of created nodes.

+ constructor + memory_pool::memory_pool

Synopsis

memory_pool(); +

Description

Constructs empty pool with default allocator functions.

+ destructor + memory_pool::~memory_pool

Synopsis

~memory_pool(); +

Description

Destroys pool and frees all the memory. This causes memory occupied by nodes allocated by the pool to be freed. Nodes allocated from the pool are no longer valid.

function memory_pool::allocate_node

Synopsis

xml_node<Ch>* allocate_node(node_type type, const Ch *name=0, const Ch *value=0, std::size_t name_size=0, std::size_t value_size=0); +

Description

Allocates a new node from the pool, and optionally assigns name and value to it. If the allocation request cannot be accommodated, this function will throw std::bad_alloc. If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function will call the rapidxml::parse_error_handler() function.

Parameters

type
Type of node to create.
name
Name to assign to the node, or 0 to assign no name.
value
Value to assign to the node, or 0 to assign no value.
name_size
Size of name to assign, or 0 to automatically calculate size from name string.
value_size
Size of value to assign, or 0 to automatically calculate size from value string.

Returns

Pointer to allocated node. This pointer will never be NULL.

function memory_pool::allocate_attribute

Synopsis

xml_attribute<Ch>* allocate_attribute(const Ch *name=0, const Ch *value=0, std::size_t name_size=0, std::size_t value_size=0); +

Description

Allocates a new attribute from the pool, and optionally assigns name and value to it. If the allocation request cannot be accommodated, this function will throw std::bad_alloc. If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function will call the rapidxml::parse_error_handler() function.

Parameters

name
Name to assign to the attribute, or 0 to assign no name.
value
Value to assign to the attribute, or 0 to assign no value.
name_size
Size of name to assign, or 0 to automatically calculate size from name string.
value_size
Size of value to assign, or 0 to automatically calculate size from value string.

Returns

Pointer to allocated attribute. This pointer will never be NULL.

function memory_pool::allocate_string

Synopsis

Ch* allocate_string(const Ch *source=0, std::size_t size=0); +

Description

Allocates a char array of given size from the pool, and optionally copies a given string to it. If the allocation request cannot be accommodated, this function will throw std::bad_alloc. If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function will call the rapidxml::parse_error_handler() function.

Parameters

source
String to initialize the allocated memory with, or 0 to not initialize it.
size
Number of characters to allocate, or zero to calculate it automatically from source string length; if size is 0, source string must be specified and null terminated.

Returns

Pointer to allocated char array. This pointer will never be NULL.

function memory_pool::clone_node

Synopsis

xml_node<Ch>* clone_node(const xml_node< Ch > *source, xml_node< Ch > *result=0); +

Description

Clones an xml_node and its hierarchy of child nodes and attributes. Nodes and attributes are allocated from this memory pool. Names and values are not cloned; they are shared between the clone and the source. A result node can optionally be specified as a second parameter, in which case its contents will be replaced with the cloned source node. This is useful when you want to clone an entire document.

Parameters

source
Node to clone.
result
Node to put results in, or 0 to automatically allocate result node

Returns

Pointer to cloned node. This pointer will never be NULL.
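As a brief sketch (assuming doc already contains a parsed tree), a subtree can be duplicated within the same document:

// Clone the first child of the document and append the copy.
// Names and values are shared with the source node, so the original
// source text must stay alive for as long as the clone is used.
rapidxml::xml_node<> *copy = doc.clone_node(doc.first_node());
doc.append_node(copy);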

function memory_pool::clear

Synopsis

void clear(); +

Description

Clears the pool. This causes memory occupied by nodes allocated by the pool to be freed. Any nodes or strings allocated from the pool will no longer be valid.

function memory_pool::set_allocator

Synopsis

void set_allocator(alloc_func *af, free_func *ff); +

Description

Sets or resets the user-defined memory allocation functions for the pool. This can only be called when no memory has been allocated from the pool yet; otherwise results are undefined. The allocation function must not return an invalid pointer on failure. It should either throw, stop the program, or use the longjmp() function to pass control to another place in the program. If it returns an invalid pointer, results are undefined.

User-defined allocation functions must have the following forms:

void *allocate(std::size_t size);
void free(void *pointer);

Parameters

af
Allocation function, or 0 to restore default function
ff
Free function, or 0 to restore default function
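A minimal sketch of plugging in custom allocation routines; the names my_alloc and my_free and the abort-on-failure policy are illustrative assumptions:

#include <cstdlib>
#include "rapidxml.hpp"

void *my_alloc(std::size_t size)
{
    void *p = std::malloc(size);
    if (!p) std::abort();                     // must never return an invalid pointer
    return p;
}

void my_free(void *pointer) { std::free(pointer); }

int main()
{
    rapidxml::xml_document<> doc;
    doc.set_allocator(&my_alloc, &my_free);   // set before any allocation from the pool
    // ... build or parse the document as usual
}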

class rapidxml::parse_error

Defined in rapidxml.hpp

Description

Parse error exception. This exception is thrown by the parser when an error occurs. Use what() function to get human-readable error message. Use where() function to get a pointer to position within source text where error was detected.

If throwing exceptions by the parser is undesirable, it can be disabled by defining the RAPIDXML_NO_EXCEPTIONS macro before rapidxml.hpp is included. This will cause the parser to call the rapidxml::parse_error_handler() function instead of throwing an exception. This function must be defined by the user.

This class derives from the std::exception class.
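A minimal sketch of the user-supplied handler when exceptions are disabled; the abort-on-error policy shown here is only an example:

#define RAPIDXML_NO_EXCEPTIONS
#include <cstdio>
#include <cstdlib>
#include "rapidxml.hpp"

// Must be provided by the user when RAPIDXML_NO_EXCEPTIONS is defined;
// it should not return control to the parser
void rapidxml::parse_error_handler(const char *what, void *where)
{
    (void)where;                              // location within source text, unused here
    std::fprintf(stderr, "rapidxml parse error: %s\n", what);
    std::abort();
}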

+ constructor + parse_error::parse_error

Synopsis

parse_error(const char *what, void *where); +

Description

Constructs parse error.

function parse_error::what

Synopsis

virtual const char* what() const; +

Description

Gets human readable description of error.

Returns

Pointer to null terminated description of the error.

function parse_error::where

Synopsis

Ch* where() const; +

Description

Gets pointer to character data where error happened. Ch should be the same as char type of xml_document that produced the error.

Returns

Pointer to the location within the parsed string where the error occurred.

class template rapidxml::xml_attribute

Defined in rapidxml.hpp
Inherits from xml_base

Description

Class representing attribute node of XML document. Each attribute has name and value strings, which are available through name() and value() functions (inherited from xml_base). Note that after parse, both name and value of attribute will point to interior of source text used for parsing. Thus, this text must persist in memory for the lifetime of attribute.

Parameters

Ch
Character type to use.

+ constructor + xml_attribute::xml_attribute

Synopsis

xml_attribute(); +

Description

Constructs an empty attribute. Consider using the memory_pool of the appropriate xml_document if allocating attributes manually.

function xml_attribute::document

Synopsis

xml_document<Ch>* document() const; +

Description

Gets document of which attribute is a child.

Returns

Pointer to document that contains this attribute, or 0 if there is no parent document.

function xml_attribute::previous_attribute

Synopsis

xml_attribute<Ch>* previous_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets previous attribute, optionally matching attribute name.

Parameters

name
Name of attribute to find, or 0 to return previous attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found attribute, or 0 if not found.

function xml_attribute::next_attribute

Synopsis

xml_attribute<Ch>* next_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets next attribute, optionally matching attribute name.

Parameters

name
Name of attribute to find, or 0 to return next attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found attribute, or 0 if not found.

class template rapidxml::xml_base

Defined in rapidxml.hpp
Base class for xml_attribute, xml_node

Description

Base class for xml_node and xml_attribute implementing common functions: name(), name_size(), value(), value_size() and parent().

Parameters

Ch
Character type to use

+ constructor + xml_base::xml_base

Synopsis

xml_base(); +

function xml_base::name

Synopsis

Ch* name() const; +

Description

Gets name of the node. Interpretation of name depends on type of node. Note that name will not be zero-terminated if rapidxml::parse_no_string_terminators option was selected during parse.

+ Use name_size() function to determine length of the name.

Returns

Name of node, or empty string if node has no name.

function xml_base::name_size

Synopsis

std::size_t name_size() const; +

Description

Gets size of node name, not including terminator character. This function works correctly irrespective of whether name is or is not zero terminated.

Returns

Size of node name, in characters.

function xml_base::value

Synopsis

Ch* value() const; +

Description

Gets value of node. Interpretation of value depends on type of node. Note that value will not be zero-terminated if rapidxml::parse_no_string_terminators option was selected during parse.

+ Use value_size() function to determine length of the value.

Returns

Value of node, or empty string if node has no value.

function xml_base::value_size

Synopsis

std::size_t value_size() const; +

Description

Gets size of node value, not including terminator character. This function works correctly irrespective of whether value is or is not zero terminated.

Returns

Size of node value, in characters.

function xml_base::name

Synopsis

void name(const Ch *name, std::size_t size); +

Description

Sets name of node to a non zero-terminated string. See Ownership Of Strings .

Note that the node does not own its name or value; it only stores a pointer to it. It will not delete or otherwise free the pointer on destruction. It is the responsibility of the user to properly manage the lifetime of the string. The easiest way to achieve this is to use the memory_pool of the document to allocate the string - on destruction of the document the string will be automatically freed.

The size of the name must be specified separately, because the name does not have to be zero terminated. Use the name(const Ch *) function to have the length automatically calculated (the string must be zero terminated).

Parameters

name
Name of node to set. Does not have to be zero terminated.
size
Size of name, in characters. This does not include zero terminator, if one is present.

function xml_base::name

Synopsis

void name(const Ch *name); +

Description

Sets name of node to a zero-terminated string. See also Ownership Of Strings and xml_node::name(const Ch *, std::size_t).

Parameters

name
Name of node to set. Must be zero terminated.

function xml_base::value

Synopsis

void value(const Ch *value, std::size_t size); +

Description

Sets value of node to a non zero-terminated string. See Ownership Of Strings .

Note that the node does not own its name or value; it only stores a pointer to it. It will not delete or otherwise free the pointer on destruction. It is the responsibility of the user to properly manage the lifetime of the string. The easiest way to achieve this is to use the memory_pool of the document to allocate the string - on destruction of the document the string will be automatically freed.

The size of the value must be specified separately, because it does not have to be zero terminated. Use the value(const Ch *) function to have the length automatically calculated (the string must be zero terminated).

If an element has a child node of type node_data, it will take precedence over the element value when printing. If you want to manipulate the data of elements using values, use the parser flag rapidxml::parse_no_data_nodes to prevent creation of data nodes by the parser.

Parameters

value
value of node to set. Does not have to be zero terminated.
size
Size of value, in characters. This does not include zero terminator, if one is present.

function xml_base::value

Synopsis

void value(const Ch *value); +

Description

Sets value of node to a zero-terminated string. See also Ownership Of Strings and xml_node::value(const Ch *, std::size_t).

Parameters

value
Value of node to set. Must be zero terminated.

function xml_base::parent

Synopsis

xml_node<Ch>* parent() const; +

Description

Gets node parent.

Returns

Pointer to parent node, or 0 if there is no parent.

class template rapidxml::xml_document

Defined in rapidxml.hpp
Inherits from xml_node, memory_pool

Description

This class represents root of the DOM hierarchy. It is also an xml_node and a memory_pool through public inheritance. Use parse() function to build a DOM tree from a zero-terminated XML text string. parse() function allocates memory for nodes and attributes by using functions of xml_document, which are inherited from memory_pool. To access root node of the document, use the document itself, as if it was an xml_node.

Parameters

Ch
Character type to use.

+ constructor + xml_document::xml_document

Synopsis

xml_document(); +

Description

Constructs empty XML document.

function xml_document::parse

Synopsis

void parse(Ch *text); +

Description

Parses a zero-terminated XML string according to the given flags. The passed string will be modified by the parser, unless the rapidxml::parse_non_destructive flag is used. The string must persist for the lifetime of the document. In case of error, a rapidxml::parse_error exception will be thrown.

If you want to parse the contents of a file, you must first load the file into memory and pass a pointer to its beginning. Make sure that the data is zero-terminated.

A document can be parsed multiple times. Each new call to parse removes previous nodes and attributes (if any), but does not clear the memory pool.

Parameters

text
XML data to parse; pointer is non-const to denote fact that this data may be modified by the parser.
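As a brief sketch using only the standard library (the helper in rapidxml_utils.hpp can be used instead), a file can be read into a writable, zero-terminated buffer and parsed:

#include <fstream>
#include <iterator>
#include <vector>
#include "rapidxml.hpp"

void parse_file(const char *path)
{
    std::ifstream in(path, std::ios::binary);
    std::vector<char> buffer((std::istreambuf_iterator<char>(in)),
                             std::istreambuf_iterator<char>());
    buffer.push_back('\0');                   // parse() requires zero-terminated text

    rapidxml::xml_document<> doc;
    doc.parse<0>(&buffer[0]);                 // buffer must outlive all uses of doc
    // ... use doc here, while buffer is still in scope
}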

function xml_document::clear

Synopsis

void clear(); +

Description

Clears the document by deleting all nodes and clearing the memory pool. All nodes owned by document pool are destroyed.

class template rapidxml::xml_node

Defined in rapidxml.hpp
Inherits from xml_base
Base class for xml_document

Description

Class representing a node of XML document. Each node may have associated name and value strings, which are available through name() and value() functions. Interpretation of name and value depends on type of the node. Type of node can be determined by using type() function.

Note that after parsing, both the name and value of a node, if any, will point to the interior of the source text used for parsing. Thus, this text must persist in memory for the lifetime of the node.

Parameters

Ch
Character type to use.

+ constructor + xml_node::xml_node

Synopsis

xml_node(node_type type); +

Description

Constructs an empty node with the specified type. Consider using memory_pool of appropriate document to allocate nodes manually.

Parameters

type
Type of node to construct.

function xml_node::type

Synopsis

node_type type() const; +

Description

Gets type of node.

Returns

Type of node.

function xml_node::document

Synopsis

xml_document<Ch>* document() const; +

Description

Gets document of which node is a child.

Returns

Pointer to document that contains this node, or 0 if there is no parent document.

function xml_node::first_node

Synopsis

xml_node<Ch>* first_node(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets first child node, optionally matching node name.

Parameters

name
Name of child to find, or 0 to return first child regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found child, or 0 if not found.

function xml_node::last_node

Synopsis

xml_node<Ch>* last_node(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets last child node, optionally matching node name. Behaviour is undefined if node has no children. Use first_node() to test if node has children.

Parameters

name
Name of child to find, or 0 to return last child regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found child, or 0 if not found.

function xml_node::previous_sibling

Synopsis

xml_node<Ch>* previous_sibling(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets previous sibling node, optionally matching node name. Behaviour is undefined if node has no parent. Use parent() to test if node has a parent.

Parameters

name
Name of sibling to find, or 0 to return previous sibling regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found sibling, or 0 if not found.

function xml_node::next_sibling

Synopsis

xml_node<Ch>* next_sibling(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets next sibling node, optionally matching node name. Behaviour is undefined if node has no parent. Use parent() to test if node has a parent.

Parameters

name
Name of sibling to find, or 0 to return next sibling regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found sibling, or 0 if not found.

function xml_node::first_attribute

Synopsis

xml_attribute<Ch>* first_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets first attribute of node, optionally matching attribute name.

Parameters

name
Name of attribute to find, or 0 to return first attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found attribute, or 0 if not found.

function xml_node::last_attribute

Synopsis

xml_attribute<Ch>* last_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets last attribute of node, optionally matching attribute name.

Parameters

name
Name of attribute to find, or 0 to return last attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found attribute, or 0 if not found.
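
A short lookup sketch, assuming "node" is an xml_node<char>* obtained from a parsed document; the attribute name "id" is illustrative (xml_attribute::next_attribute is documented separately):

    // look up a specific attribute by name; 0 is returned if it is absent
    rapidxml::xml_attribute<char> *id = node->first_attribute("id");

    // or walk all attributes of the node
    for (rapidxml::xml_attribute<char> *attr = node->first_attribute();
         attr;
         attr = attr->next_attribute())
    {
        // attr->name() and attr->value() point into the parsed source text
    }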

function xml_node::type

Synopsis

void type(node_type type);

Description

Sets type of node.

Parameters

type
Type of node to set.

function xml_node::prepend_node

Synopsis

void prepend_node(xml_node< Ch > *child);

Description

Prepends a new child node. The prepended child becomes the first child, and all existing children are moved one position back.

Parameters

child
Node to prepend.

function xml_node::append_node

Synopsis

void append_node(xml_node< Ch > *child);

Description

Appends a new child node. The appended child becomes the last child.

Parameters

child
Node to append.

function xml_node::insert_node

Synopsis

void insert_node(xml_node< Ch > *where, xml_node< Ch > *child);

Description

Inserts a new child node at specified place inside the node. All children after and including the specified node are moved one position back.

Parameters

where
Place where to insert the child, or 0 to insert at the back.
child
Node to insert.
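
A small sketch of building a tree from pool-allocated nodes; the element names are made up for illustration:

    void build(rapidxml::xml_document<char> &doc)
    {
        using namespace rapidxml;
        xml_node<char> *root  = doc.allocate_node(node_element, "root");
        xml_node<char> *first = doc.allocate_node(node_element, "first");
        xml_node<char> *last  = doc.allocate_node(node_element, "last");
        doc.append_node(root);              // root becomes the only child of the document
        root->append_node(last);            // appended as the last child of root
        root->prepend_node(first);          // becomes the first child, pushing "last" back
        root->insert_node(last, doc.allocate_node(node_element, "middle"));   // inserted just before "last"
    }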

function xml_node::remove_first_node

Synopsis

void remove_first_node();

Description

Removes first child node. If node has no children, behaviour is undefined. Use first_node() to test if node has children.

function xml_node::remove_last_node

Synopsis

void remove_last_node();

Description

Removes last child of the node. If node has no children, behaviour is undefined. Use first_node() to test if node has children.

function xml_node::remove_node

Synopsis

void remove_node(xml_node< Ch > *where);

Description

Removes specified child from the node.

Parameters

where
Pointer to child to be removed.

function xml_node::remove_all_nodes

Synopsis

void remove_all_nodes();

Description

Removes all child nodes (but not attributes).

function xml_node::prepend_attribute

Synopsis

void prepend_attribute(xml_attribute< Ch > *attribute);

Description

Prepends a new attribute to the node.

Parameters

attribute
Attribute to prepend.

function xml_node::append_attribute

Synopsis

void append_attribute(xml_attribute< Ch > *attribute);

Description

Appends a new attribute to the node.

Parameters

attribute
Attribute to append.

function xml_node::insert_attribute

Synopsis

void insert_attribute(xml_attribute< Ch > *where, xml_attribute< Ch > *attribute);

Description

Inserts a new attribute at specified place inside the node. All attributes after and including the specified attribute are moved one position back.

Parameters

where
Place where to insert the attribute, or 0 to insert at the back.
attribute
Attribute to insert.
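
Continuing the build() sketch above (doc and root as there), attributes are allocated from the same pool; the names and values are illustrative:

    xml_attribute<char> *id = doc.allocate_attribute("id", "42");
    root->append_attribute(id);                                          // becomes the last attribute of root
    root->prepend_attribute(doc.allocate_attribute("version", "1"));     // becomes the first attribute of root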

function xml_node::remove_first_attribute

Synopsis

void remove_first_attribute();

Description

Removes first attribute of the node. If node has no attributes, behaviour is undefined. Use first_attribute() to test if node has attributes.

function xml_node::remove_last_attribute

Synopsis

void remove_last_attribute();

Description

Removes last attribute of the node. If node has no attributes, behaviour is undefined. Use first_attribute() to test if node has attributes.

function xml_node::remove_attribute

Synopsis

void remove_attribute(xml_attribute< Ch > *where);

Description

Removes specified attribute from node.

Parameters

where
Pointer to attribute to be removed.

function xml_node::remove_all_attributes

Synopsis

void remove_all_attributes();

Description

Removes all attributes of node.

enum node_type

Description

Enumeration listing all node types produced by the parser. Use xml_node::type() function to query node type.

Values

node_document
A document node. Name and value are empty.
node_element
An element node. Name contains element name. Value contains text of first data node.
node_data
A data node. Name is empty. Value contains data text.
node_cdata
A CDATA node. Name is empty. Value contains data text.
node_comment
A comment node. Name is empty. Value contains comment text.
node_declaration
A declaration node. Name and value are empty. Declaration parameters (version, encoding and standalone) are in node attributes.
node_doctype
A DOCTYPE node. Name is empty. Value contains DOCTYPE text.
node_pi
A PI node. Name contains target. Value contains instructions.
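
A brief dispatch sketch; "node" is assumed to be an xml_node<char>* obtained from a parsed document, and the handling in each case is only indicated by comments:

    switch (node->type())
    {
        case rapidxml::node_element:  /* node->name(), child nodes and attributes */  break;
        case rapidxml::node_data:
        case rapidxml::node_cdata:    /* node->value() holds the text */              break;
        case rapidxml::node_comment:  /* node->value() holds the comment text */      break;
        default:                                                                       break;
    }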

function parse_error_handler

Synopsis

void rapidxml::parse_error_handler(const char *what, void *where);

Description

When exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function is called to notify user about the error. It must be defined by the user.

This function cannot return. If it does, the results are undefined.

A very simple definition might look like this:

    void rapidxml::parse_error_handler(const char *what, void *where)
    {
        std::cout << "Parse error: " << what << "\n";
        std::abort();
    }

Parameters

what
Human readable description of the error.
where
Pointer to character data where error was detected.
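
A minimal sketch of wiring this up; note that RAPIDXML_NO_EXCEPTIONS must be defined before rapidxml.hpp is included:

    #define RAPIDXML_NO_EXCEPTIONS
    #include <cstdio>
    #include <cstdlib>
    #include "rapidxml.hpp"

    void rapidxml::parse_error_handler(const char *what, void *where)
    {
        (void)where;                                     // position within source text, unused here
        std::fprintf(stderr, "Parse error: %s\n", what);
        std::abort();                                    // this handler must not return
    }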

function print

Synopsis

OutIt rapidxml::print(OutIt out, const xml_node< Ch > &node, int flags=0);

Description

Prints XML to given output iterator.

Parameters

out
Output iterator to print to.
node
Node to be printed. Pass xml_document to print entire document.
flags
Flags controlling how XML is printed.

Returns

Output iterator pointing to position immediately after last character of printed text.

function print

Synopsis

std::basic_ostream<Ch>& rapidxml::print(std::basic_ostream< Ch > &out, const xml_node< Ch > &node, int flags=0);

Description

Prints XML to given output stream.

Parameters

out
Output stream to print to.
node
Node to be printed. Pass xml_document to print entire document.
flags
Flags controlling how XML is printed.

Returns

Output stream.

function operator<<

Synopsis

std::basic_ostream<Ch>& rapidxml::operator<<(std::basic_ostream< Ch > &out, const xml_node< Ch > &node);

Description

Prints formatted XML to given output stream. Uses default printing flags. Use print() function to customize printing process.

Parameters

out
Output stream to print to.
node
Node to be printed.

Returns

Output stream.
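
A short printing sketch; print() and operator<< are provided by rapidxml_print.hpp, and the dump() helper is an illustrative assumption:

    #include <iostream>
    #include <iterator>
    #include <string>
    #include "rapidxml.hpp"
    #include "rapidxml_print.hpp"

    void dump(const rapidxml::xml_document<char> &doc)
    {
        std::string text;
        rapidxml::print(std::back_inserter(text), doc, 0);   // iterator overload, default flags
        std::cout << text;
        std::cout << doc;                                     // stream overload via operator<<
    }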

constant parse_no_data_nodes

Synopsis

const int parse_no_data_nodes = 0x1;

Description

Parse flag instructing the parser to not create data nodes. Text of first data node will still be placed in value of parent element, unless rapidxml::parse_no_element_values flag is also specified. Can be combined with other flags by use of | operator.

See xml_document::parse() function.
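
Parse flags are passed as a template argument to xml_document::parse() and combined with the | operator. A brief sketch, assuming "buffer" is a mutable, zero-terminated copy of the XML text:

    rapidxml::xml_document<char> doc;
    doc.parse<rapidxml::parse_no_data_nodes
            | rapidxml::parse_no_element_values>(buffer);    // buffer is char*, modified in place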

constant parse_no_element_values

Synopsis

const int parse_no_element_values = 0x2;

Description

Parse flag instructing the parser to not use text of first data node as a value of parent element. Can be combined with other flags by use of | operator. Note that child data nodes of element node take precedence over its value when printing. That is, if element has one or more child data nodes and a value, the value will be ignored. Use rapidxml::parse_no_data_nodes flag to prevent creation of data nodes if you want to manipulate data using values of elements.

See xml_document::parse() function.

constant parse_no_string_terminators

Synopsis

const int parse_no_string_terminators = 0x4;

Description

Parse flag instructing the parser to not place zero terminators after strings in the source text. By default zero terminators are placed, modifying source text. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_no_entity_translation

Synopsis

const int parse_no_entity_translation = 0x8;

Description

Parse flag instructing the parser to not translate entities in the source text. By default entities are translated, modifying source text. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_no_utf8

Synopsis

const int parse_no_utf8 = 0x10;

Description

Parse flag instructing the parser to disable UTF-8 handling and assume plain 8 bit characters. By default, UTF-8 handling is enabled. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_declaration_node

Synopsis

const int parse_declaration_node = 0x20;

Description

Parse flag instructing the parser to create XML declaration node. By default, declaration node is not created. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_comment_nodes

Synopsis

const int parse_comment_nodes = 0x40;

Description

Parse flag instructing the parser to create comment nodes. By default, comment nodes are not created. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_doctype_node

Synopsis

const int parse_doctype_node = 0x80;

Description

Parse flag instructing the parser to create DOCTYPE node. By default, doctype node is not created. Although W3C specification allows at most one DOCTYPE node, RapidXml will silently accept documents with more than one. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_pi_nodes

Synopsis

const int parse_pi_nodes = 0x100;

Description

Parse flag instructing the parser to create PI nodes. By default, PI nodes are not created. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_validate_closing_tags

Synopsis

const int parse_validate_closing_tags = 0x200;

Description

Parse flag instructing the parser to validate closing tag names. If not set, name inside closing tag is irrelevant to the parser. By default, closing tags are not validated. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_trim_whitespace

Synopsis

const int parse_trim_whitespace = 0x400;

Description

Parse flag instructing the parser to trim all leading and trailing whitespace of data nodes. By default, whitespace is not trimmed. This flag does not cause the parser to modify source text. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_normalize_whitespace

Synopsis

const int parse_normalize_whitespace = 0x800;

Description

Parse flag instructing the parser to condense all whitespace runs of data nodes to a single space character. Trimming of leading and trailing whitespace of data is controlled by rapidxml::parse_trim_whitespace flag. By default, whitespace is not normalized. If this flag is specified, source text will be modified. Can be combined with other flags by use of | operator.

See xml_document::parse() function.
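
For example, a parse that both trims and condenses whitespace in data nodes (with doc and buffer as in the earlier sketch):

    doc.parse<rapidxml::parse_trim_whitespace
            | rapidxml::parse_normalize_whitespace>(buffer);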

constant parse_default

Synopsis

const int parse_default = 0;

Description

Parse flags which represent the default behaviour of the parser. This is always equal to 0, so that all other flags can simply be ORed together. Normally there is no need to inconveniently disable flags by ANDing with their negated (~) values. This also means that the meaning of each flag is a negation of the default setting. For example, if the flag name is rapidxml::parse_no_utf8, it means that UTF-8 is enabled by default, and using the flag will disable it.

See xml_document::parse() function.

constant parse_non_destructive

Synopsis

const int parse_non_destructive = parse_no_string_terminators | parse_no_entity_translation;

Description

A combination of parse flags that forbids any modifications of the source text. This also results in faster parsing. However, note that the following will occur:
  • names and values of nodes will not be zero terminated; you have to use xml_base::name_size() and xml_base::value_size() functions to determine where name and value end
  • entities will not be translated
  • whitespace will not be normalized

See xml_document::parse() function.
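
Because names and values are not zero terminated under this flag set, pair name()/value() with name_size()/value_size(); a brief sketch (doc and buffer as in the earlier sketches):

    doc.parse<rapidxml::parse_non_destructive>(buffer);
    rapidxml::xml_node<char> *node = doc.first_node();            // 0 if the document is empty
    std::string name(node->name(), node->name_size());            // copy out the non-terminated name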

constant parse_fastest

Synopsis

const int parse_fastest = parse_non_destructive | parse_no_data_nodes;

Description

A combination of parse flags resulting in fastest possible parsing, without sacrificing important data.

See xml_document::parse() function.

constant parse_full

Synopsis

const int parse_full = parse_declaration_node | parse_comment_nodes | parse_doctype_node | parse_pi_nodes | parse_validate_closing_tags;

Description

A combination of parse flags resulting in largest amount of data being extracted. This usually results in slowest parsing.

See xml_document::parse() function.

constant print_no_indenting

Synopsis

const int print_no_indenting = 0x1;

Description

Printer flag instructing the printer to suppress indenting of XML. See print() function.
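
For example, reusing the printing sketch above, to print without indentation:

    rapidxml::print(std::back_inserter(text), doc, rapidxml::print_no_indenting);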

\ No newline at end of file diff --git a/thirdpt/rapidxml-1.13/rapidxml.hpp b/thirdpt/rapidxml-1.13/rapidxml.hpp new file mode 100644 index 0000000..ae91e08 --- /dev/null +++ b/thirdpt/rapidxml-1.13/rapidxml.hpp @@ -0,0 +1,2596 @@ +#ifndef RAPIDXML_HPP_INCLUDED +#define RAPIDXML_HPP_INCLUDED + +// Copyright (C) 2006, 2009 Marcin Kalicinski +// Version 1.13 +// Revision $DateTime: 2009/05/13 01:46:17 $ +//! \file rapidxml.hpp This file contains rapidxml parser and DOM implementation + +// If standard library is disabled, user must provide implementations of required functions and typedefs +#if !defined(RAPIDXML_NO_STDLIB) + #include // For std::size_t + #include // For assert + #include // For placement new +#endif + +// On MSVC, disable "conditional expression is constant" warning (level 4). +// This warning is almost impossible to avoid with certain types of templated code +#ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable:4127) // Conditional expression is constant +#endif + +/////////////////////////////////////////////////////////////////////////// +// RAPIDXML_PARSE_ERROR + +#if defined(RAPIDXML_NO_EXCEPTIONS) + +#define RAPIDXML_PARSE_ERROR(what, where) { parse_error_handler(what, where); assert(0); } + +namespace rapidxml +{ + //! When exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, + //! this function is called to notify user about the error. + //! It must be defined by the user. + //!

+ //! This function cannot return. If it does, the results are undefined. + //!

+ //! A very simple definition might look like that: + //!

+    //! void %rapidxml::%parse_error_handler(const char *what, void *where)
+    //! {
+    //!     std::cout << "Parse error: " << what << "\n";
+    //!     std::abort();
+    //! }
+    //! 
+ //! \param what Human readable description of the error. + //! \param where Pointer to character data where error was detected. + void parse_error_handler(const char *what, void *where); +} + +#else + +#include // For std::exception + +#define RAPIDXML_PARSE_ERROR(what, where) throw parse_error(what, where) + +namespace rapidxml +{ + + //! Parse error exception. + //! This exception is thrown by the parser when an error occurs. + //! Use what() function to get human-readable error message. + //! Use where() function to get a pointer to position within source text where error was detected. + //!

+ //! If throwing exceptions by the parser is undesirable, + //! it can be disabled by defining RAPIDXML_NO_EXCEPTIONS macro before rapidxml.hpp is included. + //! This will cause the parser to call rapidxml::parse_error_handler() function instead of throwing an exception. + //! This function must be defined by the user. + //!

+ //! This class derives from std::exception class. + class parse_error: public std::exception + { + + public: + + //! Constructs parse error + parse_error(const char *what, void *where) + : m_what(what) + , m_where(where) + { + } + + //! Gets human readable description of error. + //! \return Pointer to null terminated description of the error. + virtual const char *what() const throw() + { + return m_what; + } + + //! Gets pointer to character data where error happened. + //! Ch should be the same as char type of xml_document that produced the error. + //! \return Pointer to location within the parsed string where error occured. + template + Ch *where() const + { + return reinterpret_cast(m_where); + } + + private: + + const char *m_what; + void *m_where; + + }; +} + +#endif + +/////////////////////////////////////////////////////////////////////////// +// Pool sizes + +#ifndef RAPIDXML_STATIC_POOL_SIZE + // Size of static memory block of memory_pool. + // Define RAPIDXML_STATIC_POOL_SIZE before including rapidxml.hpp if you want to override the default value. + // No dynamic memory allocations are performed by memory_pool until static memory is exhausted. + #define RAPIDXML_STATIC_POOL_SIZE (64 * 1024) +#endif + +#ifndef RAPIDXML_DYNAMIC_POOL_SIZE + // Size of dynamic memory block of memory_pool. + // Define RAPIDXML_DYNAMIC_POOL_SIZE before including rapidxml.hpp if you want to override the default value. + // After the static block is exhausted, dynamic blocks with approximately this size are allocated by memory_pool. + #define RAPIDXML_DYNAMIC_POOL_SIZE (64 * 1024) +#endif + +#ifndef RAPIDXML_ALIGNMENT + // Memory allocation alignment. + // Define RAPIDXML_ALIGNMENT before including rapidxml.hpp if you want to override the default value, which is the size of pointer. + // All memory allocations for nodes, attributes and strings will be aligned to this value. + // This must be a power of 2 and at least 1, otherwise memory_pool will not work. + #define RAPIDXML_ALIGNMENT sizeof(void *) +#endif + +namespace rapidxml +{ + // Forward declarations + template class xml_node; + template class xml_attribute; + template class xml_document; + + //! Enumeration listing all node types produced by the parser. + //! Use xml_node::type() function to query node type. + enum node_type + { + node_document, //!< A document node. Name and value are empty. + node_element, //!< An element node. Name contains element name. Value contains text of first data node. + node_data, //!< A data node. Name is empty. Value contains data text. + node_cdata, //!< A CDATA node. Name is empty. Value contains data text. + node_comment, //!< A comment node. Name is empty. Value contains comment text. + node_declaration, //!< A declaration node. Name and value are empty. Declaration parameters (version, encoding and standalone) are in node attributes. + node_doctype, //!< A DOCTYPE node. Name is empty. Value contains DOCTYPE text. + node_pi //!< A PI node. Name contains target. Value contains instructions. + }; + + /////////////////////////////////////////////////////////////////////// + // Parsing flags + + //! Parse flag instructing the parser to not create data nodes. + //! Text of first data node will still be placed in value of parent element, unless rapidxml::parse_no_element_values flag is also specified. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_no_data_nodes = 0x1; + + //! Parse flag instructing the parser to not use text of first data node as a value of parent element. + //! Can be combined with other flags by use of | operator. + //! Note that child data nodes of element node take precendence over its value when printing. + //! That is, if element has one or more child data nodes and a value, the value will be ignored. + //! Use rapidxml::parse_no_data_nodes flag to prevent creation of data nodes if you want to manipulate data using values of elements. + //!

+ //! See xml_document::parse() function. + const int parse_no_element_values = 0x2; + + //! Parse flag instructing the parser to not place zero terminators after strings in the source text. + //! By default zero terminators are placed, modifying source text. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_no_string_terminators = 0x4; + + //! Parse flag instructing the parser to not translate entities in the source text. + //! By default entities are translated, modifying source text. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_no_entity_translation = 0x8; + + //! Parse flag instructing the parser to disable UTF-8 handling and assume plain 8 bit characters. + //! By default, UTF-8 handling is enabled. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_no_utf8 = 0x10; + + //! Parse flag instructing the parser to create XML declaration node. + //! By default, declaration node is not created. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_declaration_node = 0x20; + + //! Parse flag instructing the parser to create comments nodes. + //! By default, comment nodes are not created. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_comment_nodes = 0x40; + + //! Parse flag instructing the parser to create DOCTYPE node. + //! By default, doctype node is not created. + //! Although W3C specification allows at most one DOCTYPE node, RapidXml will silently accept documents with more than one. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_doctype_node = 0x80; + + //! Parse flag instructing the parser to create PI nodes. + //! By default, PI nodes are not created. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_pi_nodes = 0x100; + + //! Parse flag instructing the parser to validate closing tag names. + //! If not set, name inside closing tag is irrelevant to the parser. + //! By default, closing tags are not validated. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_validate_closing_tags = 0x200; + + //! Parse flag instructing the parser to trim all leading and trailing whitespace of data nodes. + //! By default, whitespace is not trimmed. + //! This flag does not cause the parser to modify source text. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_trim_whitespace = 0x400; + + //! Parse flag instructing the parser to condense all whitespace runs of data nodes to a single space character. + //! Trimming of leading and trailing whitespace of data is controlled by rapidxml::parse_trim_whitespace flag. + //! By default, whitespace is not normalized. + //! If this flag is specified, source text will be modified. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_normalize_whitespace = 0x800; + + // Compound flags + + //! Parse flags which represent default behaviour of the parser. + //! This is always equal to 0, so that all other flags can be simply ored together. + //! Normally there is no need to inconveniently disable flags by anding with their negated (~) values. + //! This also means that meaning of each flag is a negation of the default setting. + //! For example, if flag name is rapidxml::parse_no_utf8, it means that utf-8 is enabled by default, + //! and using the flag will disable it. + //!

+ //! See xml_document::parse() function. + const int parse_default = 0; + + //! A combination of parse flags that forbids any modifications of the source text. + //! This also results in faster parsing. However, note that the following will occur: + //!
    + //!
  • names and values of nodes will not be zero terminated, you have to use xml_base::name_size() and xml_base::value_size() functions to determine where name and value ends
  • + //!
  • entities will not be translated
  • + //!
  • whitespace will not be normalized
  • + //!
+ //! See xml_document::parse() function. + const int parse_non_destructive = parse_no_string_terminators | parse_no_entity_translation; + + //! A combination of parse flags resulting in fastest possible parsing, without sacrificing important data. + //!

+ //! See xml_document::parse() function. + const int parse_fastest = parse_non_destructive | parse_no_data_nodes; + + //! A combination of parse flags resulting in largest amount of data being extracted. + //! This usually results in slowest parsing. + //!

+ //! See xml_document::parse() function. + const int parse_full = parse_declaration_node | parse_comment_nodes | parse_doctype_node | parse_pi_nodes | parse_validate_closing_tags; + + /////////////////////////////////////////////////////////////////////// + // Internals + + //! \cond internal + namespace internal + { + + // Struct that contains lookup tables for the parser + // It must be a template to allow correct linking (because it has static data members, which are defined in a header file). + template + struct lookup_tables + { + static const unsigned char lookup_whitespace[256]; // Whitespace table + static const unsigned char lookup_node_name[256]; // Node name table + static const unsigned char lookup_text[256]; // Text table + static const unsigned char lookup_text_pure_no_ws[256]; // Text table + static const unsigned char lookup_text_pure_with_ws[256]; // Text table + static const unsigned char lookup_attribute_name[256]; // Attribute name table + static const unsigned char lookup_attribute_data_1[256]; // Attribute data table with single quote + static const unsigned char lookup_attribute_data_1_pure[256]; // Attribute data table with single quote + static const unsigned char lookup_attribute_data_2[256]; // Attribute data table with double quotes + static const unsigned char lookup_attribute_data_2_pure[256]; // Attribute data table with double quotes + static const unsigned char lookup_digits[256]; // Digits + static const unsigned char lookup_upcase[256]; // To uppercase conversion table for ASCII characters + }; + + // Find length of the string + template + inline std::size_t measure(const Ch *p) + { + const Ch *tmp = p; + while (*tmp) + ++tmp; + return tmp - p; + } + + // Compare strings for equality + template + inline bool compare(const Ch *p1, std::size_t size1, const Ch *p2, std::size_t size2, bool case_sensitive) + { + if (size1 != size2) + return false; + if (case_sensitive) + { + for (const Ch *end = p1 + size1; p1 < end; ++p1, ++p2) + if (*p1 != *p2) + return false; + } + else + { + for (const Ch *end = p1 + size1; p1 < end; ++p1, ++p2) + if (lookup_tables<0>::lookup_upcase[static_cast(*p1)] != lookup_tables<0>::lookup_upcase[static_cast(*p2)]) + return false; + } + return true; + } + } + //! \endcond + + /////////////////////////////////////////////////////////////////////// + // Memory pool + + //! This class is used by the parser to create new nodes and attributes, without overheads of dynamic memory allocation. + //! In most cases, you will not need to use this class directly. + //! However, if you need to create nodes manually or modify names/values of nodes, + //! you are encouraged to use memory_pool of relevant xml_document to allocate the memory. + //! Not only is this faster than allocating them by using new operator, + //! but also their lifetime will be tied to the lifetime of document, + //! possibly simplyfing memory management. + //!

+ //! Call allocate_node() or allocate_attribute() functions to obtain new nodes or attributes from the pool. + //! You can also call allocate_string() function to allocate strings. + //! Such strings can then be used as names or values of nodes without worrying about their lifetime. + //! Note that there is no free() function -- all allocations are freed at once when clear() function is called, + //! or when the pool is destroyed. + //!

+ //! It is also possible to create a standalone memory_pool, and use it + //! to allocate nodes, whose lifetime will not be tied to any document. + //!

+ //! Pool maintains RAPIDXML_STATIC_POOL_SIZE bytes of statically allocated memory. + //! Until static memory is exhausted, no dynamic memory allocations are done. + //! When static memory is exhausted, pool allocates additional blocks of memory of size RAPIDXML_DYNAMIC_POOL_SIZE each, + //! by using global new[] and delete[] operators. + //! This behaviour can be changed by setting custom allocation routines. + //! Use set_allocator() function to set them. + //!

+ //! Allocations for nodes, attributes and strings are aligned at RAPIDXML_ALIGNMENT bytes. + //! This value defaults to the size of pointer on target architecture. + //!

+ //! To obtain absolutely top performance from the parser, + //! it is important that all nodes are allocated from a single, contiguous block of memory. + //! Otherwise, cache misses when jumping between two (or more) disjoint blocks of memory can slow down parsing quite considerably. + //! If required, you can tweak RAPIDXML_STATIC_POOL_SIZE, RAPIDXML_DYNAMIC_POOL_SIZE and RAPIDXML_ALIGNMENT + //! to obtain best wasted memory to performance compromise. + //! To do it, define their values before rapidxml.hpp file is included. + //! \param Ch Character type of created nodes. + template + class memory_pool + { + + public: + + //! \cond internal + typedef void *(alloc_func)(std::size_t); // Type of user-defined function used to allocate memory + typedef void (free_func)(void *); // Type of user-defined function used to free memory + //! \endcond + + //! Constructs empty pool with default allocator functions. + memory_pool() + : m_alloc_func(0) + , m_free_func(0) + { + init(); + } + + //! Destroys pool and frees all the memory. + //! This causes memory occupied by nodes allocated by the pool to be freed. + //! Nodes allocated from the pool are no longer valid. + ~memory_pool() + { + clear(); + } + + //! Allocates a new node from the pool, and optionally assigns name and value to it. + //! If the allocation request cannot be accomodated, this function will throw std::bad_alloc. + //! If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function + //! will call rapidxml::parse_error_handler() function. + //! \param type Type of node to create. + //! \param name Name to assign to the node, or 0 to assign no name. + //! \param value Value to assign to the node, or 0 to assign no value. + //! \param name_size Size of name to assign, or 0 to automatically calculate size from name string. + //! \param value_size Size of value to assign, or 0 to automatically calculate size from value string. + //! \return Pointer to allocated node. This pointer will never be NULL. + xml_node *allocate_node(node_type type, + const Ch *name = 0, const Ch *value = 0, + std::size_t name_size = 0, std::size_t value_size = 0) + { + void *memory = allocate_aligned(sizeof(xml_node)); + xml_node *node = new(memory) xml_node(type); + if (name) + { + if (name_size > 0) + node->name(name, name_size); + else + node->name(name); + } + if (value) + { + if (value_size > 0) + node->value(value, value_size); + else + node->value(value); + } + return node; + } + + //! Allocates a new attribute from the pool, and optionally assigns name and value to it. + //! If the allocation request cannot be accomodated, this function will throw std::bad_alloc. + //! If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function + //! will call rapidxml::parse_error_handler() function. + //! \param name Name to assign to the attribute, or 0 to assign no name. + //! \param value Value to assign to the attribute, or 0 to assign no value. + //! \param name_size Size of name to assign, or 0 to automatically calculate size from name string. + //! \param value_size Size of value to assign, or 0 to automatically calculate size from value string. + //! \return Pointer to allocated attribute. This pointer will never be NULL. 
+ xml_attribute *allocate_attribute(const Ch *name = 0, const Ch *value = 0, + std::size_t name_size = 0, std::size_t value_size = 0) + { + void *memory = allocate_aligned(sizeof(xml_attribute)); + xml_attribute *attribute = new(memory) xml_attribute; + if (name) + { + if (name_size > 0) + attribute->name(name, name_size); + else + attribute->name(name); + } + if (value) + { + if (value_size > 0) + attribute->value(value, value_size); + else + attribute->value(value); + } + return attribute; + } + + //! Allocates a char array of given size from the pool, and optionally copies a given string to it. + //! If the allocation request cannot be accomodated, this function will throw std::bad_alloc. + //! If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function + //! will call rapidxml::parse_error_handler() function. + //! \param source String to initialize the allocated memory with, or 0 to not initialize it. + //! \param size Number of characters to allocate, or zero to calculate it automatically from source string length; if size is 0, source string must be specified and null terminated. + //! \return Pointer to allocated char array. This pointer will never be NULL. + Ch *allocate_string(const Ch *source = 0, std::size_t size = 0) + { + assert(source || size); // Either source or size (or both) must be specified + if (size == 0) + size = internal::measure(source) + 1; + Ch *result = static_cast(allocate_aligned(size * sizeof(Ch))); + if (source) + for (std::size_t i = 0; i < size; ++i) + result[i] = source[i]; + return result; + } + + //! Clones an xml_node and its hierarchy of child nodes and attributes. + //! Nodes and attributes are allocated from this memory pool. + //! Names and values are not cloned, they are shared between the clone and the source. + //! Result node can be optionally specified as a second parameter, + //! in which case its contents will be replaced with cloned source node. + //! This is useful when you want to clone entire document. + //! \param source Node to clone. + //! \param result Node to put results in, or 0 to automatically allocate result node + //! \return Pointer to cloned node. This pointer will never be NULL. + xml_node *clone_node(const xml_node *source, xml_node *result = 0) + { + // Prepare result node + if (result) + { + result->remove_all_attributes(); + result->remove_all_nodes(); + result->type(source->type()); + } + else + result = allocate_node(source->type()); + + // Clone name and value + result->name(source->name(), source->name_size()); + result->value(source->value(), source->value_size()); + + // Clone child nodes and attributes + for (xml_node *child = source->first_node(); child; child = child->next_sibling()) + result->append_node(clone_node(child)); + for (xml_attribute *attr = source->first_attribute(); attr; attr = attr->next_attribute()) + result->append_attribute(allocate_attribute(attr->name(), attr->value(), attr->name_size(), attr->value_size())); + + return result; + } + + //! Clears the pool. + //! This causes memory occupied by nodes allocated by the pool to be freed. + //! Any nodes or strings allocated from the pool will no longer be valid. + void clear() + { + while (m_begin != m_static_memory) + { + char *previous_begin = reinterpret_cast
(align(m_begin))->previous_begin; + if (m_free_func) + m_free_func(m_begin); + else + delete[] m_begin; + m_begin = previous_begin; + } + init(); + } + + //! Sets or resets the user-defined memory allocation functions for the pool. + //! This can only be called when no memory is allocated from the pool yet, otherwise results are undefined. + //! Allocation function must not return invalid pointer on failure. It should either throw, + //! stop the program, or use longjmp() function to pass control to other place of program. + //! If it returns invalid pointer, results are undefined. + //!

+ //! User defined allocation functions must have the following forms: + //!
+ //!
void *allocate(std::size_t size); + //!
void free(void *pointer); + //!

+ //! \param af Allocation function, or 0 to restore default function + //! \param ff Free function, or 0 to restore default function + void set_allocator(alloc_func *af, free_func *ff) + { + assert(m_begin == m_static_memory && m_ptr == align(m_begin)); // Verify that no memory is allocated yet + m_alloc_func = af; + m_free_func = ff; + } + + private: + + struct header + { + char *previous_begin; + }; + + void init() + { + m_begin = m_static_memory; + m_ptr = align(m_begin); + m_end = m_static_memory + sizeof(m_static_memory); + } + + char *align(char *ptr) + { + std::size_t alignment = ((RAPIDXML_ALIGNMENT - (std::size_t(ptr) & (RAPIDXML_ALIGNMENT - 1))) & (RAPIDXML_ALIGNMENT - 1)); + return ptr + alignment; + } + + char *allocate_raw(std::size_t size) + { + // Allocate + void *memory; + if (m_alloc_func) // Allocate memory using either user-specified allocation function or global operator new[] + { + memory = m_alloc_func(size); + assert(memory); // Allocator is not allowed to return 0, on failure it must either throw, stop the program or use longjmp + } + else + { + memory = new char[size]; +#ifdef RAPIDXML_NO_EXCEPTIONS + if (!memory) // If exceptions are disabled, verify memory allocation, because new will not be able to throw bad_alloc + RAPIDXML_PARSE_ERROR("out of memory", 0); +#endif + } + return static_cast(memory); + } + + void *allocate_aligned(std::size_t size) + { + // Calculate aligned pointer + char *result = align(m_ptr); + + // If not enough memory left in current pool, allocate a new pool + if (result + size > m_end) + { + // Calculate required pool size (may be bigger than RAPIDXML_DYNAMIC_POOL_SIZE) + std::size_t pool_size = RAPIDXML_DYNAMIC_POOL_SIZE; + if (pool_size < size) + pool_size = size; + + // Allocate + std::size_t alloc_size = sizeof(header) + (2 * RAPIDXML_ALIGNMENT - 2) + pool_size; // 2 alignments required in worst case: one for header, one for actual allocation + char *raw_memory = allocate_raw(alloc_size); + + // Setup new pool in allocated memory + char *pool = align(raw_memory); + header *new_header = reinterpret_cast
(pool); + new_header->previous_begin = m_begin; + m_begin = raw_memory; + m_ptr = pool + sizeof(header); + m_end = raw_memory + alloc_size; + + // Calculate aligned pointer again using new pool + result = align(m_ptr); + } + + // Update pool and return aligned pointer + m_ptr = result + size; + return result; + } + + char *m_begin; // Start of raw memory making up current pool + char *m_ptr; // First free byte in current pool + char *m_end; // One past last available byte in current pool + char m_static_memory[RAPIDXML_STATIC_POOL_SIZE]; // Static raw memory + alloc_func *m_alloc_func; // Allocator function, or 0 if default is to be used + free_func *m_free_func; // Free function, or 0 if default is to be used + }; + + /////////////////////////////////////////////////////////////////////////// + // XML base + + //! Base class for xml_node and xml_attribute implementing common functions: + //! name(), name_size(), value(), value_size() and parent(). + //! \param Ch Character type to use + template + class xml_base + { + + public: + + /////////////////////////////////////////////////////////////////////////// + // Construction & destruction + + // Construct a base with empty name, value and parent + xml_base() + : m_name(0) + , m_value(0) + , m_parent(0) + { + } + + /////////////////////////////////////////////////////////////////////////// + // Node data access + + //! Gets name of the node. + //! Interpretation of name depends on type of node. + //! Note that name will not be zero-terminated if rapidxml::parse_no_string_terminators option was selected during parse. + //!

+ //! Use name_size() function to determine length of the name. + //! \return Name of node, or empty string if node has no name. + Ch *name() const + { + return m_name ? m_name : nullstr(); + } + + //! Gets size of node name, not including terminator character. + //! This function works correctly irrespective of whether name is or is not zero terminated. + //! \return Size of node name, in characters. + std::size_t name_size() const + { + return m_name ? m_name_size : 0; + } + + //! Gets value of node. + //! Interpretation of value depends on type of node. + //! Note that value will not be zero-terminated if rapidxml::parse_no_string_terminators option was selected during parse. + //!

+ //! Use value_size() function to determine length of the value. + //! \return Value of node, or empty string if node has no value. + Ch *value() const + { + return m_value ? m_value : nullstr(); + } + + //! Gets size of node value, not including terminator character. + //! This function works correctly irrespective of whether value is or is not zero terminated. + //! \return Size of node value, in characters. + std::size_t value_size() const + { + return m_value ? m_value_size : 0; + } + + /////////////////////////////////////////////////////////////////////////// + // Node modification + + //! Sets name of node to a non zero-terminated string. + //! See \ref ownership_of_strings. + //!

+ //! Note that node does not own its name or value, it only stores a pointer to it. + //! It will not delete or otherwise free the pointer on destruction. + //! It is reponsibility of the user to properly manage lifetime of the string. + //! The easiest way to achieve it is to use memory_pool of the document to allocate the string - + //! on destruction of the document the string will be automatically freed. + //!

+ //! Size of name must be specified separately, because name does not have to be zero terminated. + //! Use name(const Ch *) function to have the length automatically calculated (string must be zero terminated). + //! \param name Name of node to set. Does not have to be zero terminated. + //! \param size Size of name, in characters. This does not include zero terminator, if one is present. + void name(const Ch *name, std::size_t size) + { + m_name = const_cast(name); + m_name_size = size; + } + + //! Sets name of node to a zero-terminated string. + //! See also \ref ownership_of_strings and xml_node::name(const Ch *, std::size_t). + //! \param name Name of node to set. Must be zero terminated. + void name(const Ch *name) + { + this->name(name, internal::measure(name)); + } + + //! Sets value of node to a non zero-terminated string. + //! See \ref ownership_of_strings. + //!

+ //! Note that node does not own its name or value, it only stores a pointer to it. + //! It will not delete or otherwise free the pointer on destruction. + //! It is reponsibility of the user to properly manage lifetime of the string. + //! The easiest way to achieve it is to use memory_pool of the document to allocate the string - + //! on destruction of the document the string will be automatically freed. + //!

+ //! Size of value must be specified separately, because it does not have to be zero terminated. + //! Use value(const Ch *) function to have the length automatically calculated (string must be zero terminated). + //!

+ //! If an element has a child node of type node_data, it will take precedence over element value when printing. + //! If you want to manipulate data of elements using values, use parser flag rapidxml::parse_no_data_nodes to prevent creation of data nodes by the parser. + //! \param value value of node to set. Does not have to be zero terminated. + //! \param size Size of value, in characters. This does not include zero terminator, if one is present. + void value(const Ch *value, std::size_t size) + { + m_value = const_cast(value); + m_value_size = size; + } + + //! Sets value of node to a zero-terminated string. + //! See also \ref ownership_of_strings and xml_node::value(const Ch *, std::size_t). + //! \param value Vame of node to set. Must be zero terminated. + void value(const Ch *value) + { + this->value(value, internal::measure(value)); + } + + /////////////////////////////////////////////////////////////////////////// + // Related nodes access + + //! Gets node parent. + //! \return Pointer to parent node, or 0 if there is no parent. + xml_node *parent() const + { + return m_parent; + } + + protected: + + // Return empty string + static Ch *nullstr() + { + static Ch zero = Ch('\0'); + return &zero; + } + + Ch *m_name; // Name of node, or 0 if no name + Ch *m_value; // Value of node, or 0 if no value + std::size_t m_name_size; // Length of node name, or undefined of no name + std::size_t m_value_size; // Length of node value, or undefined if no value + xml_node *m_parent; // Pointer to parent node, or 0 if none + + }; + + //! Class representing attribute node of XML document. + //! Each attribute has name and value strings, which are available through name() and value() functions (inherited from xml_base). + //! Note that after parse, both name and value of attribute will point to interior of source text used for parsing. + //! Thus, this text must persist in memory for the lifetime of attribute. + //! \param Ch Character type to use. + template + class xml_attribute: public xml_base + { + + friend class xml_node; + + public: + + /////////////////////////////////////////////////////////////////////////// + // Construction & destruction + + //! Constructs an empty attribute with the specified type. + //! Consider using memory_pool of appropriate xml_document if allocating attributes manually. + xml_attribute() + { + } + + /////////////////////////////////////////////////////////////////////////// + // Related nodes access + + //! Gets document of which attribute is a child. + //! \return Pointer to document that contains this attribute, or 0 if there is no parent document. + xml_document *document() const + { + if (xml_node *node = this->parent()) + { + while (node->parent()) + node = node->parent(); + return node->type() == node_document ? static_cast *>(node) : 0; + } + else + return 0; + } + + //! Gets previous attribute, optionally matching attribute name. + //! \param name Name of attribute to find, or 0 to return previous attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found attribute, or 0 if not found. 
+ xml_attribute *previous_attribute(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_attribute *attribute = m_prev_attribute; attribute; attribute = attribute->m_prev_attribute) + if (internal::compare(attribute->name(), attribute->name_size(), name, name_size, case_sensitive)) + return attribute; + return 0; + } + else + return this->m_parent ? m_prev_attribute : 0; + } + + //! Gets next attribute, optionally matching attribute name. + //! \param name Name of attribute to find, or 0 to return next attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found attribute, or 0 if not found. + xml_attribute *next_attribute(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_attribute *attribute = m_next_attribute; attribute; attribute = attribute->m_next_attribute) + if (internal::compare(attribute->name(), attribute->name_size(), name, name_size, case_sensitive)) + return attribute; + return 0; + } + else + return this->m_parent ? m_next_attribute : 0; + } + + private: + + xml_attribute *m_prev_attribute; // Pointer to previous sibling of attribute, or 0 if none; only valid if parent is non-zero + xml_attribute *m_next_attribute; // Pointer to next sibling of attribute, or 0 if none; only valid if parent is non-zero + + }; + + /////////////////////////////////////////////////////////////////////////// + // XML node + + //! Class representing a node of XML document. + //! Each node may have associated name and value strings, which are available through name() and value() functions. + //! Interpretation of name and value depends on type of the node. + //! Type of node can be determined by using type() function. + //!

+ //! Note that after parse, both name and value of node, if any, will point interior of source text used for parsing. + //! Thus, this text must persist in the memory for the lifetime of node. + //! \param Ch Character type to use. + template + class xml_node: public xml_base + { + + public: + + /////////////////////////////////////////////////////////////////////////// + // Construction & destruction + + //! Constructs an empty node with the specified type. + //! Consider using memory_pool of appropriate document to allocate nodes manually. + //! \param type Type of node to construct. + xml_node(node_type type) + : m_type(type) + , m_first_node(0) + , m_first_attribute(0) + { + } + + /////////////////////////////////////////////////////////////////////////// + // Node data access + + //! Gets type of node. + //! \return Type of node. + node_type type() const + { + return m_type; + } + + /////////////////////////////////////////////////////////////////////////// + // Related nodes access + + //! Gets document of which node is a child. + //! \return Pointer to document that contains this node, or 0 if there is no parent document. + xml_document *document() const + { + xml_node *node = const_cast *>(this); + while (node->parent()) + node = node->parent(); + return node->type() == node_document ? static_cast *>(node) : 0; + } + + //! Gets first child node, optionally matching node name. + //! \param name Name of child to find, or 0 to return first child regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found child, or 0 if not found. + xml_node *first_node(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_node *child = m_first_node; child; child = child->next_sibling()) + if (internal::compare(child->name(), child->name_size(), name, name_size, case_sensitive)) + return child; + return 0; + } + else + return m_first_node; + } + + //! Gets last child node, optionally matching node name. + //! Behaviour is undefined if node has no children. + //! Use first_node() to test if node has children. + //! \param name Name of child to find, or 0 to return last child regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found child, or 0 if not found. + xml_node *last_node(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + assert(m_first_node); // Cannot query for last child if node has no children + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_node *child = m_last_node; child; child = child->previous_sibling()) + if (internal::compare(child->name(), child->name_size(), name, name_size, case_sensitive)) + return child; + return 0; + } + else + return m_last_node; + } + + //! Gets previous sibling node, optionally matching node name. + //! Behaviour is undefined if node has no parent. 
+ //! Use parent() to test if node has a parent. + //! \param name Name of sibling to find, or 0 to return previous sibling regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found sibling, or 0 if not found. + xml_node *previous_sibling(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + assert(this->m_parent); // Cannot query for siblings if node has no parent + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_node *sibling = m_prev_sibling; sibling; sibling = sibling->m_prev_sibling) + if (internal::compare(sibling->name(), sibling->name_size(), name, name_size, case_sensitive)) + return sibling; + return 0; + } + else + return m_prev_sibling; + } + + //! Gets next sibling node, optionally matching node name. + //! Behaviour is undefined if node has no parent. + //! Use parent() to test if node has a parent. + //! \param name Name of sibling to find, or 0 to return next sibling regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found sibling, or 0 if not found. + xml_node *next_sibling(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + assert(this->m_parent); // Cannot query for siblings if node has no parent + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_node *sibling = m_next_sibling; sibling; sibling = sibling->m_next_sibling) + if (internal::compare(sibling->name(), sibling->name_size(), name, name_size, case_sensitive)) + return sibling; + return 0; + } + else + return m_next_sibling; + } + + //! Gets first attribute of node, optionally matching attribute name. + //! \param name Name of attribute to find, or 0 to return first attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found attribute, or 0 if not found. + xml_attribute *first_attribute(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_attribute *attribute = m_first_attribute; attribute; attribute = attribute->m_next_attribute) + if (internal::compare(attribute->name(), attribute->name_size(), name, name_size, case_sensitive)) + return attribute; + return 0; + } + else + return m_first_attribute; + } + + //! Gets last attribute of node, optionally matching attribute name. + //! \param name Name of attribute to find, or 0 to return last attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! 
\param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found attribute, or 0 if not found. + xml_attribute *last_attribute(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_attribute *attribute = m_last_attribute; attribute; attribute = attribute->m_prev_attribute) + if (internal::compare(attribute->name(), attribute->name_size(), name, name_size, case_sensitive)) + return attribute; + return 0; + } + else + return m_first_attribute ? m_last_attribute : 0; + } + + /////////////////////////////////////////////////////////////////////////// + // Node modification + + //! Sets type of node. + //! \param type Type of node to set. + void type(node_type type) + { + m_type = type; + } + + /////////////////////////////////////////////////////////////////////////// + // Node manipulation + + //! Prepends a new child node. + //! The prepended child becomes the first child, and all existing children are moved one position back. + //! \param child Node to prepend. + void prepend_node(xml_node *child) + { + assert(child && !child->parent() && child->type() != node_document); + if (first_node()) + { + child->m_next_sibling = m_first_node; + m_first_node->m_prev_sibling = child; + } + else + { + child->m_next_sibling = 0; + m_last_node = child; + } + m_first_node = child; + child->m_parent = this; + child->m_prev_sibling = 0; + } + + //! Appends a new child node. + //! The appended child becomes the last child. + //! \param child Node to append. + void append_node(xml_node *child) + { + assert(child && !child->parent() && child->type() != node_document); + if (first_node()) + { + child->m_prev_sibling = m_last_node; + m_last_node->m_next_sibling = child; + } + else + { + child->m_prev_sibling = 0; + m_first_node = child; + } + m_last_node = child; + child->m_parent = this; + child->m_next_sibling = 0; + } + + //! Inserts a new child node at specified place inside the node. + //! All children after and including the specified node are moved one position back. + //! \param where Place where to insert the child, or 0 to insert at the back. + //! \param child Node to insert. + void insert_node(xml_node *where, xml_node *child) + { + assert(!where || where->parent() == this); + assert(child && !child->parent() && child->type() != node_document); + if (where == m_first_node) + prepend_node(child); + else if (where == 0) + append_node(child); + else + { + child->m_prev_sibling = where->m_prev_sibling; + child->m_next_sibling = where; + where->m_prev_sibling->m_next_sibling = child; + where->m_prev_sibling = child; + child->m_parent = this; + } + } + + //! Removes first child node. + //! If node has no children, behaviour is undefined. + //! Use first_node() to test if node has children. + void remove_first_node() + { + assert(first_node()); + xml_node *child = m_first_node; + m_first_node = child->m_next_sibling; + if (child->m_next_sibling) + child->m_next_sibling->m_prev_sibling = 0; + else + m_last_node = 0; + child->m_parent = 0; + } + + //! Removes last child of the node. + //! If node has no children, behaviour is undefined. + //! Use first_node() to test if node has children. 
+ void remove_last_node() + { + assert(first_node()); + xml_node *child = m_last_node; + if (child->m_prev_sibling) + { + m_last_node = child->m_prev_sibling; + child->m_prev_sibling->m_next_sibling = 0; + } + else + m_first_node = 0; + child->m_parent = 0; + } + + //! Removes specified child from the node + // \param where Pointer to child to be removed. + void remove_node(xml_node *where) + { + assert(where && where->parent() == this); + assert(first_node()); + if (where == m_first_node) + remove_first_node(); + else if (where == m_last_node) + remove_last_node(); + else + { + where->m_prev_sibling->m_next_sibling = where->m_next_sibling; + where->m_next_sibling->m_prev_sibling = where->m_prev_sibling; + where->m_parent = 0; + } + } + + //! Removes all child nodes (but not attributes). + void remove_all_nodes() + { + for (xml_node *node = first_node(); node; node = node->m_next_sibling) + node->m_parent = 0; + m_first_node = 0; + } + + //! Prepends a new attribute to the node. + //! \param attribute Attribute to prepend. + void prepend_attribute(xml_attribute *attribute) + { + assert(attribute && !attribute->parent()); + if (first_attribute()) + { + attribute->m_next_attribute = m_first_attribute; + m_first_attribute->m_prev_attribute = attribute; + } + else + { + attribute->m_next_attribute = 0; + m_last_attribute = attribute; + } + m_first_attribute = attribute; + attribute->m_parent = this; + attribute->m_prev_attribute = 0; + } + + //! Appends a new attribute to the node. + //! \param attribute Attribute to append. + void append_attribute(xml_attribute *attribute) + { + assert(attribute && !attribute->parent()); + if (first_attribute()) + { + attribute->m_prev_attribute = m_last_attribute; + m_last_attribute->m_next_attribute = attribute; + } + else + { + attribute->m_prev_attribute = 0; + m_first_attribute = attribute; + } + m_last_attribute = attribute; + attribute->m_parent = this; + attribute->m_next_attribute = 0; + } + + //! Inserts a new attribute at specified place inside the node. + //! All attributes after and including the specified attribute are moved one position back. + //! \param where Place where to insert the attribute, or 0 to insert at the back. + //! \param attribute Attribute to insert. + void insert_attribute(xml_attribute *where, xml_attribute *attribute) + { + assert(!where || where->parent() == this); + assert(attribute && !attribute->parent()); + if (where == m_first_attribute) + prepend_attribute(attribute); + else if (where == 0) + append_attribute(attribute); + else + { + attribute->m_prev_attribute = where->m_prev_attribute; + attribute->m_next_attribute = where; + where->m_prev_attribute->m_next_attribute = attribute; + where->m_prev_attribute = attribute; + attribute->m_parent = this; + } + } + + //! Removes first attribute of the node. + //! If node has no attributes, behaviour is undefined. + //! Use first_attribute() to test if node has attributes. + void remove_first_attribute() + { + assert(first_attribute()); + xml_attribute *attribute = m_first_attribute; + if (attribute->m_next_attribute) + { + attribute->m_next_attribute->m_prev_attribute = 0; + } + else + m_last_attribute = 0; + attribute->m_parent = 0; + m_first_attribute = attribute->m_next_attribute; + } + + //! Removes last attribute of the node. + //! If node has no attributes, behaviour is undefined. + //! Use first_attribute() to test if node has attributes. 
+ void remove_last_attribute() + { + assert(first_attribute()); + xml_attribute *attribute = m_last_attribute; + if (attribute->m_prev_attribute) + { + attribute->m_prev_attribute->m_next_attribute = 0; + m_last_attribute = attribute->m_prev_attribute; + } + else + m_first_attribute = 0; + attribute->m_parent = 0; + } + + //! Removes specified attribute from node. + //! \param where Pointer to attribute to be removed. + void remove_attribute(xml_attribute *where) + { + assert(first_attribute() && where->parent() == this); + if (where == m_first_attribute) + remove_first_attribute(); + else if (where == m_last_attribute) + remove_last_attribute(); + else + { + where->m_prev_attribute->m_next_attribute = where->m_next_attribute; + where->m_next_attribute->m_prev_attribute = where->m_prev_attribute; + where->m_parent = 0; + } + } + + //! Removes all attributes of node. + void remove_all_attributes() + { + for (xml_attribute *attribute = first_attribute(); attribute; attribute = attribute->m_next_attribute) + attribute->m_parent = 0; + m_first_attribute = 0; + } + + private: + + /////////////////////////////////////////////////////////////////////////// + // Restrictions + + // No copying + xml_node(const xml_node &); + void operator =(const xml_node &); + + /////////////////////////////////////////////////////////////////////////// + // Data members + + // Note that some of the pointers below have UNDEFINED values if certain other pointers are 0. + // This is required for maximum performance, as it allows the parser to omit initialization of + // unneded/redundant values. + // + // The rules are as follows: + // 1. first_node and first_attribute contain valid pointers, or 0 if node has no children/attributes respectively + // 2. last_node and last_attribute are valid only if node has at least one child/attribute respectively, otherwise they contain garbage + // 3. prev_sibling and next_sibling are valid only if node has a parent, otherwise they contain garbage + + node_type m_type; // Type of node; always valid + xml_node *m_first_node; // Pointer to first child node, or 0 if none; always valid + xml_node *m_last_node; // Pointer to last child node, or 0 if none; this value is only valid if m_first_node is non-zero + xml_attribute *m_first_attribute; // Pointer to first attribute of node, or 0 if none; always valid + xml_attribute *m_last_attribute; // Pointer to last attribute of node, or 0 if none; this value is only valid if m_first_attribute is non-zero + xml_node *m_prev_sibling; // Pointer to previous sibling of node, or 0 if none; this value is only valid if m_parent is non-zero + xml_node *m_next_sibling; // Pointer to next sibling of node, or 0 if none; this value is only valid if m_parent is non-zero + + }; + + /////////////////////////////////////////////////////////////////////////// + // XML document + + //! This class represents root of the DOM hierarchy. + //! It is also an xml_node and a memory_pool through public inheritance. + //! Use parse() function to build a DOM tree from a zero-terminated XML text string. + //! parse() function allocates memory for nodes and attributes by using functions of xml_document, + //! which are inherited from memory_pool. + //! To access root node of the document, use the document itself, as if it was an xml_node. + //! \param Ch Character type to use. + template + class xml_document: public xml_node, public memory_pool + { + + public: + + //! Constructs empty XML document + xml_document() + : xml_node(node_document) + { + } + + //! 
Parses zero-terminated XML string according to given flags. + //! Passed string will be modified by the parser, unless rapidxml::parse_non_destructive flag is used. + //! The string must persist for the lifetime of the document. + //! In case of error, rapidxml::parse_error exception will be thrown. + //! + //! If you want to parse contents of a file, you must first load the file into memory and pass a pointer to its beginning. + //! Make sure that data is zero-terminated. + //!
+ //! Document can be parsed into multiple times. + //! Each new call to parse removes previous nodes and attributes (if any), but does not clear memory pool. + //! \param text XML data to parse; pointer is non-const to denote fact that this data may be modified by the parser. + template + void parse(Ch *text) + { + assert(text); + + // Remove current contents + this->remove_all_nodes(); + this->remove_all_attributes(); + + // Parse BOM, if any + parse_bom(text); + + // Parse children + while (1) + { + // Skip whitespace before node + skip(text); + if (*text == 0) + break; + + // Parse and append new child + if (*text == Ch('<')) + { + ++text; // Skip '<' + if (xml_node *node = parse_node(text)) + this->append_node(node); + } + else + RAPIDXML_PARSE_ERROR("expected <", text); + } + + } + + //! Clears the document by deleting all nodes and clearing the memory pool. + //! All nodes owned by document pool are destroyed. + void clear() + { + this->remove_all_nodes(); + this->remove_all_attributes(); + memory_pool::clear(); + } + + private: + + /////////////////////////////////////////////////////////////////////// + // Internal character utility functions + + // Detect whitespace character + struct whitespace_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_whitespace[static_cast(ch)]; + } + }; + + // Detect node name character + struct node_name_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_node_name[static_cast(ch)]; + } + }; + + // Detect attribute name character + struct attribute_name_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_attribute_name[static_cast(ch)]; + } + }; + + // Detect text character (PCDATA) + struct text_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_text[static_cast(ch)]; + } + }; + + // Detect text character (PCDATA) that does not require processing + struct text_pure_no_ws_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_text_pure_no_ws[static_cast(ch)]; + } + }; + + // Detect text character (PCDATA) that does not require processing + struct text_pure_with_ws_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_text_pure_with_ws[static_cast(ch)]; + } + }; + + // Detect attribute value character + template + struct attribute_value_pred + { + static unsigned char test(Ch ch) + { + if (Quote == Ch('\'')) + return internal::lookup_tables<0>::lookup_attribute_data_1[static_cast(ch)]; + if (Quote == Ch('\"')) + return internal::lookup_tables<0>::lookup_attribute_data_2[static_cast(ch)]; + return 0; // Should never be executed, to avoid warnings on Comeau + } + }; + + // Detect attribute value character + template + struct attribute_value_pure_pred + { + static unsigned char test(Ch ch) + { + if (Quote == Ch('\'')) + return internal::lookup_tables<0>::lookup_attribute_data_1_pure[static_cast(ch)]; + if (Quote == Ch('\"')) + return internal::lookup_tables<0>::lookup_attribute_data_2_pure[static_cast(ch)]; + return 0; // Should never be executed, to avoid warnings on Comeau + } + }; + + // Insert coded character, using UTF8 or 8-bit ASCII + template + static void insert_coded_character(Ch *&text, unsigned long code) + { + if (Flags & parse_no_utf8) + { + // Insert 8-bit ASCII character + // Todo: possibly verify that code is less than 256 and use replacement char otherwise? 
+ text[0] = static_cast(code); + text += 1; + } + else + { + // Insert UTF8 sequence + if (code < 0x80) // 1 byte sequence + { + text[0] = static_cast(code); + text += 1; + } + else if (code < 0x800) // 2 byte sequence + { + text[1] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[0] = static_cast(code | 0xC0); + text += 2; + } + else if (code < 0x10000) // 3 byte sequence + { + text[2] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[1] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[0] = static_cast(code | 0xE0); + text += 3; + } + else if (code < 0x110000) // 4 byte sequence + { + text[3] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[2] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[1] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[0] = static_cast(code | 0xF0); + text += 4; + } + else // Invalid, only codes up to 0x10FFFF are allowed in Unicode + { + RAPIDXML_PARSE_ERROR("invalid numeric character entity", text); + } + } + } + + // Skip characters until predicate evaluates to true + template + static void skip(Ch *&text) + { + Ch *tmp = text; + while (StopPred::test(*tmp)) + ++tmp; + text = tmp; + } + + // Skip characters until predicate evaluates to true while doing the following: + // - replacing XML character entity references with proper characters (' & " < > &#...;) + // - condensing whitespace sequences to single space character + template + static Ch *skip_and_expand_character_refs(Ch *&text) + { + // If entity translation, whitespace condense and whitespace trimming is disabled, use plain skip + if (Flags & parse_no_entity_translation && + !(Flags & parse_normalize_whitespace) && + !(Flags & parse_trim_whitespace)) + { + skip(text); + return text; + } + + // Use simple skip until first modification is detected + skip(text); + + // Use translation skip + Ch *src = text; + Ch *dest = src; + while (StopPred::test(*src)) + { + // If entity translation is enabled + if (!(Flags & parse_no_entity_translation)) + { + // Test if replacement is needed + if (src[0] == Ch('&')) + { + switch (src[1]) + { + + // & ' + case Ch('a'): + if (src[2] == Ch('m') && src[3] == Ch('p') && src[4] == Ch(';')) + { + *dest = Ch('&'); + ++dest; + src += 5; + continue; + } + if (src[2] == Ch('p') && src[3] == Ch('o') && src[4] == Ch('s') && src[5] == Ch(';')) + { + *dest = Ch('\''); + ++dest; + src += 6; + continue; + } + break; + + // " + case Ch('q'): + if (src[2] == Ch('u') && src[3] == Ch('o') && src[4] == Ch('t') && src[5] == Ch(';')) + { + *dest = Ch('"'); + ++dest; + src += 6; + continue; + } + break; + + // > + case Ch('g'): + if (src[2] == Ch('t') && src[3] == Ch(';')) + { + *dest = Ch('>'); + ++dest; + src += 4; + continue; + } + break; + + // < + case Ch('l'): + if (src[2] == Ch('t') && src[3] == Ch(';')) + { + *dest = Ch('<'); + ++dest; + src += 4; + continue; + } + break; + + // &#...; - assumes ASCII + case Ch('#'): + if (src[2] == Ch('x')) + { + unsigned long code = 0; + src += 3; // Skip &#x + while (1) + { + unsigned char digit = internal::lookup_tables<0>::lookup_digits[static_cast(*src)]; + if (digit == 0xFF) + break; + code = code * 16 + digit; + ++src; + } + insert_coded_character(dest, code); // Put character in output + } + else + { + unsigned long code = 0; + src += 2; // Skip &# + while (1) + { + unsigned char digit = internal::lookup_tables<0>::lookup_digits[static_cast(*src)]; + if (digit == 0xFF) + break; + code = code * 10 + digit; + ++src; + } + insert_coded_character(dest, code); // Put character in output + } + if (*src == 
Ch(';')) + ++src; + else + RAPIDXML_PARSE_ERROR("expected ;", src); + continue; + + // Something else + default: + // Ignore, just copy '&' verbatim + break; + + } + } + } + + // If whitespace condensing is enabled + if (Flags & parse_normalize_whitespace) + { + // Test if condensing is needed + if (whitespace_pred::test(*src)) + { + *dest = Ch(' '); ++dest; // Put single space in dest + ++src; // Skip first whitespace char + // Skip remaining whitespace chars + while (whitespace_pred::test(*src)) + ++src; + continue; + } + } + + // No replacement, only copy character + *dest++ = *src++; + + } + + // Return new end + text = src; + return dest; + + } + + /////////////////////////////////////////////////////////////////////// + // Internal parsing functions + + // Parse BOM, if any + template + void parse_bom(Ch *&text) + { + // UTF-8? + if (static_cast(text[0]) == 0xEF && + static_cast(text[1]) == 0xBB && + static_cast(text[2]) == 0xBF) + { + text += 3; // Skup utf-8 bom + } + } + + // Parse XML declaration ( + xml_node *parse_xml_declaration(Ch *&text) + { + // If parsing of declaration is disabled + if (!(Flags & parse_declaration_node)) + { + // Skip until end of declaration + while (text[0] != Ch('?') || text[1] != Ch('>')) + { + if (!text[0]) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + text += 2; // Skip '?>' + return 0; + } + + // Create declaration + xml_node *declaration = this->allocate_node(node_declaration); + + // Skip whitespace before attributes or ?> + skip(text); + + // Parse declaration attributes + parse_node_attributes(text, declaration); + + // Skip ?> + if (text[0] != Ch('?') || text[1] != Ch('>')) + RAPIDXML_PARSE_ERROR("expected ?>", text); + text += 2; + + return declaration; + } + + // Parse XML comment (' + return 0; // Do not produce comment node + } + + // Remember value start + Ch *value = text; + + // Skip until end of comment + while (text[0] != Ch('-') || text[1] != Ch('-') || text[2] != Ch('>')) + { + if (!text[0]) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + + // Create comment node + xml_node *comment = this->allocate_node(node_comment); + comment->value(value, text - value); + + // Place zero terminator after comment value + if (!(Flags & parse_no_string_terminators)) + *text = Ch('\0'); + + text += 3; // Skip '-->' + return comment; + } + + // Parse DOCTYPE + template + xml_node *parse_doctype(Ch *&text) + { + // Remember value start + Ch *value = text; + + // Skip to > + while (*text != Ch('>')) + { + // Determine character type + switch (*text) + { + + // If '[' encountered, scan for matching ending ']' using naive algorithm with depth + // This works for all W3C test files except for 2 most wicked + case Ch('['): + { + ++text; // Skip '[' + int depth = 1; + while (depth > 0) + { + switch (*text) + { + case Ch('['): ++depth; break; + case Ch(']'): --depth; break; + case 0: RAPIDXML_PARSE_ERROR("unexpected end of data", text); + } + ++text; + } + break; + } + + // Error on end of text + case Ch('\0'): + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + + // Other character, skip it + default: + ++text; + + } + } + + // If DOCTYPE nodes enabled + if (Flags & parse_doctype_node) + { + // Create a new doctype node + xml_node *doctype = this->allocate_node(node_doctype); + doctype->value(value, text - value); + + // Place zero terminator after value + if (!(Flags & parse_no_string_terminators)) + *text = Ch('\0'); + + text += 1; // skip '>' + return doctype; + } + else + { + text += 1; // skip '>' 
+ return 0; + } + + } + + // Parse PI + template + xml_node *parse_pi(Ch *&text) + { + // If creation of PI nodes is enabled + if (Flags & parse_pi_nodes) + { + // Create pi node + xml_node *pi = this->allocate_node(node_pi); + + // Extract PI target name + Ch *name = text; + skip(text); + if (text == name) + RAPIDXML_PARSE_ERROR("expected PI target", text); + pi->name(name, text - name); + + // Skip whitespace between pi target and pi + skip(text); + + // Remember start of pi + Ch *value = text; + + // Skip to '?>' + while (text[0] != Ch('?') || text[1] != Ch('>')) + { + if (*text == Ch('\0')) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + + // Set pi value (verbatim, no entity expansion or whitespace normalization) + pi->value(value, text - value); + + // Place zero terminator after name and value + if (!(Flags & parse_no_string_terminators)) + { + pi->name()[pi->name_size()] = Ch('\0'); + pi->value()[pi->value_size()] = Ch('\0'); + } + + text += 2; // Skip '?>' + return pi; + } + else + { + // Skip to '?>' + while (text[0] != Ch('?') || text[1] != Ch('>')) + { + if (*text == Ch('\0')) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + text += 2; // Skip '?>' + return 0; + } + } + + // Parse and append data + // Return character that ends data. + // This is necessary because this character might have been overwritten by a terminating 0 + template + Ch parse_and_append_data(xml_node *node, Ch *&text, Ch *contents_start) + { + // Backup to contents start if whitespace trimming is disabled + if (!(Flags & parse_trim_whitespace)) + text = contents_start; + + // Skip until end of data + Ch *value = text, *end; + if (Flags & parse_normalize_whitespace) + end = skip_and_expand_character_refs(text); + else + end = skip_and_expand_character_refs(text); + + // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after > + if (Flags & parse_trim_whitespace) + { + if (Flags & parse_normalize_whitespace) + { + // Whitespace is already condensed to single space characters by skipping function, so just trim 1 char off the end + if (*(end - 1) == Ch(' ')) + --end; + } + else + { + // Backup until non-whitespace character is found + while (whitespace_pred::test(*(end - 1))) + --end; + } + } + + // If characters are still left between end and value (this test is only necessary if normalization is enabled) + // Create new data node + if (!(Flags & parse_no_data_nodes)) + { + xml_node *data = this->allocate_node(node_data); + data->value(value, end - value); + node->append_node(data); + } + + // Add data to parent node if no data exists yet + if (!(Flags & parse_no_element_values)) + if (*node->value() == Ch('\0')) + node->value(value, end - value); + + // Place zero terminator after value + if (!(Flags & parse_no_string_terminators)) + { + Ch ch = *text; + *end = Ch('\0'); + return ch; // Return character that ends data; this is required because zero terminator overwritten it + } + + // Return character that ends data + return *text; + } + + // Parse CDATA + template + xml_node *parse_cdata(Ch *&text) + { + // If CDATA is disabled + if (Flags & parse_no_data_nodes) + { + // Skip until end of cdata + while (text[0] != Ch(']') || text[1] != Ch(']') || text[2] != Ch('>')) + { + if (!text[0]) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + text += 3; // Skip ]]> + return 0; // Do not produce CDATA node + } + + // Skip until end of cdata + Ch *value = text; + while (text[0] != Ch(']') || text[1] != Ch(']') 
|| text[2] != Ch('>')) + { + if (!text[0]) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + + // Create new cdata node + xml_node *cdata = this->allocate_node(node_cdata); + cdata->value(value, text - value); + + // Place zero terminator after value + if (!(Flags & parse_no_string_terminators)) + *text = Ch('\0'); + + text += 3; // Skip ]]> + return cdata; + } + + // Parse element node + template + xml_node *parse_element(Ch *&text) + { + // Create element node + xml_node *element = this->allocate_node(node_element); + + // Extract element name + Ch *name = text; + skip(text); + if (text == name) + RAPIDXML_PARSE_ERROR("expected element name", text); + element->name(name, text - name); + + // Skip whitespace between element name and attributes or > + skip(text); + + // Parse attributes, if any + parse_node_attributes(text, element); + + // Determine ending type + if (*text == Ch('>')) + { + ++text; + parse_node_contents(text, element); + } + else if (*text == Ch('/')) + { + ++text; + if (*text != Ch('>')) + RAPIDXML_PARSE_ERROR("expected >", text); + ++text; + } + else + RAPIDXML_PARSE_ERROR("expected >", text); + + // Place zero terminator after name + if (!(Flags & parse_no_string_terminators)) + element->name()[element->name_size()] = Ch('\0'); + + // Return parsed element + return element; + } + + // Determine node type, and parse it + template + xml_node *parse_node(Ch *&text) + { + // Parse proper node type + switch (text[0]) + { + + // <... + default: + // Parse and append element node + return parse_element(text); + + // (text); + } + else + { + // Parse PI + return parse_pi(text); + } + + // (text); + } + break; + + // (text); + } + break; + + // (text); + } + + } // switch + + // Attempt to skip other, unrecognized node types starting with ')) + { + if (*text == 0) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + ++text; // Skip '>' + return 0; // No node recognized + + } + } + + // Parse contents of the node - children, data etc. + template + void parse_node_contents(Ch *&text, xml_node *node) + { + // For all children and text + while (1) + { + // Skip whitespace between > and node contents + Ch *contents_start = text; // Store start of node contents before whitespace is skipped + skip(text); + Ch next_char = *text; + + // After data nodes, instead of continuing the loop, control jumps here. + // This is because zero termination inside parse_and_append_data() function + // would wreak havoc with the above code. + // Also, skipping whitespace after data nodes is unnecessary. + after_data_node: + + // Determine what comes next: node closing, child node, data node, or 0? 
+ switch (next_char) + { + + // Node closing or child node + case Ch('<'): + if (text[1] == Ch('/')) + { + // Node closing + text += 2; // Skip '(text); + if (!internal::compare(node->name(), node->name_size(), closing_name, text - closing_name, true)) + RAPIDXML_PARSE_ERROR("invalid closing tag name", text); + } + else + { + // No validation, just skip name + skip(text); + } + // Skip remaining whitespace after node name + skip(text); + if (*text != Ch('>')) + RAPIDXML_PARSE_ERROR("expected >", text); + ++text; // Skip '>' + return; // Node closed, finished parsing contents + } + else + { + // Child node + ++text; // Skip '<' + if (xml_node *child = parse_node(text)) + node->append_node(child); + } + break; + + // End of data - error + case Ch('\0'): + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + + // Data node + default: + next_char = parse_and_append_data(node, text, contents_start); + goto after_data_node; // Bypass regular processing after data nodes + + } + } + } + + // Parse XML attributes of the node + template + void parse_node_attributes(Ch *&text, xml_node *node) + { + // For all attributes + while (attribute_name_pred::test(*text)) + { + // Extract attribute name + Ch *name = text; + ++text; // Skip first character of attribute name + skip(text); + if (text == name) + RAPIDXML_PARSE_ERROR("expected attribute name", name); + + // Create new attribute + xml_attribute *attribute = this->allocate_attribute(); + attribute->name(name, text - name); + node->append_attribute(attribute); + + // Skip whitespace after attribute name + skip(text); + + // Skip = + if (*text != Ch('=')) + RAPIDXML_PARSE_ERROR("expected =", text); + ++text; + + // Add terminating zero after name + if (!(Flags & parse_no_string_terminators)) + attribute->name()[attribute->name_size()] = 0; + + // Skip whitespace after = + skip(text); + + // Skip quote and remember if it was ' or " + Ch quote = *text; + if (quote != Ch('\'') && quote != Ch('"')) + RAPIDXML_PARSE_ERROR("expected ' or \"", text); + ++text; + + // Extract attribute value and expand char refs in it + Ch *value = text, *end; + const int AttFlags = Flags & ~parse_normalize_whitespace; // No whitespace normalization in attributes + if (quote == Ch('\'')) + end = skip_and_expand_character_refs, attribute_value_pure_pred, AttFlags>(text); + else + end = skip_and_expand_character_refs, attribute_value_pure_pred, AttFlags>(text); + + // Set attribute value + attribute->value(value, end - value); + + // Make sure that end quote is present + if (*text != quote) + RAPIDXML_PARSE_ERROR("expected ' or \"", text); + ++text; // Skip quote + + // Add terminating zero after value + if (!(Flags & parse_no_string_terminators)) + attribute->value()[attribute->value_size()] = 0; + + // Skip whitespace after attribute value + skip(text); + } + } + + }; + + //! 
\cond internal + namespace internal + { + + // Whitespace (space \n \r \t) + template + const unsigned char lookup_tables::lookup_whitespace[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, // 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1 + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 7 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // C + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // D + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // E + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F + }; + + // Node name (anything but space \n \r \t / > ? \0) + template + const unsigned char lookup_tables::lookup_node_name[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Text (i.e. PCDATA) (anything but < \0) + template + const unsigned char lookup_tables::lookup_text[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Text (i.e. 
PCDATA) that does not require processing when ws normalization is disabled + // (anything but < \0 &) + template + const unsigned char lookup_tables::lookup_text_pure_no_ws[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Text (i.e. PCDATA) that does not require processing when ws normalizationis is enabled + // (anything but < \0 & space \n \r \t) + template + const unsigned char lookup_tables::lookup_text_pure_with_ws[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute name (anything but space \n \r \t / < > = ? ! 
\0) + template + const unsigned char lookup_tables::lookup_attribute_name[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute data with single quote (anything but ' \0) + template + const unsigned char lookup_tables::lookup_attribute_data_1[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute data with single quote that does not require processing (anything but ' \0 &) + template + const unsigned char lookup_tables::lookup_attribute_data_1_pure[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute data with double quote (anything but " \0) + template + const unsigned char lookup_tables::lookup_attribute_data_2[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute data with double quote that does not require processing (anything but " \0 &) + template + const unsigned char lookup_tables::lookup_attribute_data_2_pure[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Digits (dec and hex, 255 denotes end of numeric character reference) + template + const unsigned char lookup_tables::lookup_digits[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 1 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 2 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,255,255,255,255,255,255, // 3 + 255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255, // 4 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 5 + 255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255, // 6 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 7 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 8 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 9 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // A + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // B + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // C + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // D + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // E + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255 // F + }; + + // Upper case conversion + template + const unsigned char lookup_tables::lookup_upcase[256] = + { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A B C D E F + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0 + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // 1 + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // 2 + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // 3 + 64, 65, 
66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // 4 + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, // 5 + 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // 6 + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123,124,125,126,127, // 7 + 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, // 8 + 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159, // 9 + 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175, // A + 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191, // B + 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207, // C + 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223, // D + 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239, // E + 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255 // F + }; + } + //! \endcond + +} + +// Undefine internal macros +#undef RAPIDXML_PARSE_ERROR + +// On MSVC, restore warnings state +#ifdef _MSC_VER + #pragma warning(pop) +#endif + +#endif diff --git a/thirdpt/rapidxml-1.13/rapidxml_iterators.hpp b/thirdpt/rapidxml-1.13/rapidxml_iterators.hpp new file mode 100644 index 0000000..52ebc29 --- /dev/null +++ b/thirdpt/rapidxml-1.13/rapidxml_iterators.hpp @@ -0,0 +1,174 @@ +#ifndef RAPIDXML_ITERATORS_HPP_INCLUDED +#define RAPIDXML_ITERATORS_HPP_INCLUDED + +// Copyright (C) 2006, 2009 Marcin Kalicinski +// Version 1.13 +// Revision $DateTime: 2009/05/13 01:46:17 $ +//! \file rapidxml_iterators.hpp This file contains rapidxml iterators + +#include "rapidxml.hpp" + +namespace rapidxml +{ + + //! Iterator of child nodes of xml_node + template + class node_iterator + { + + public: + + typedef typename xml_node value_type; + typedef typename xml_node &reference; + typedef typename xml_node *pointer; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + + node_iterator() + : m_node(0) + { + } + + node_iterator(xml_node *node) + : m_node(node->first_node()) + { + } + + reference operator *() const + { + assert(m_node); + return *m_node; + } + + pointer operator->() const + { + assert(m_node); + return m_node; + } + + node_iterator& operator++() + { + assert(m_node); + m_node = m_node->next_sibling(); + return *this; + } + + node_iterator operator++(int) + { + node_iterator tmp = *this; + ++this; + return tmp; + } + + node_iterator& operator--() + { + assert(m_node && m_node->previous_sibling()); + m_node = m_node->previous_sibling(); + return *this; + } + + node_iterator operator--(int) + { + node_iterator tmp = *this; + ++this; + return tmp; + } + + bool operator ==(const node_iterator &rhs) + { + return m_node == rhs.m_node; + } + + bool operator !=(const node_iterator &rhs) + { + return m_node != rhs.m_node; + } + + private: + + xml_node *m_node; + + }; + + //! 
Iterator of child attributes of xml_node + template + class attribute_iterator + { + + public: + + typedef typename xml_attribute value_type; + typedef typename xml_attribute &reference; + typedef typename xml_attribute *pointer; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + + attribute_iterator() + : m_attribute(0) + { + } + + attribute_iterator(xml_node *node) + : m_attribute(node->first_attribute()) + { + } + + reference operator *() const + { + assert(m_attribute); + return *m_attribute; + } + + pointer operator->() const + { + assert(m_attribute); + return m_attribute; + } + + attribute_iterator& operator++() + { + assert(m_attribute); + m_attribute = m_attribute->next_attribute(); + return *this; + } + + attribute_iterator operator++(int) + { + attribute_iterator tmp = *this; + ++this; + return tmp; + } + + attribute_iterator& operator--() + { + assert(m_attribute && m_attribute->previous_attribute()); + m_attribute = m_attribute->previous_attribute(); + return *this; + } + + attribute_iterator operator--(int) + { + attribute_iterator tmp = *this; + ++this; + return tmp; + } + + bool operator ==(const attribute_iterator &rhs) + { + return m_attribute == rhs.m_attribute; + } + + bool operator !=(const attribute_iterator &rhs) + { + return m_attribute != rhs.m_attribute; + } + + private: + + xml_attribute *m_attribute; + + }; + +} + +#endif diff --git a/thirdpt/rapidxml-1.13/rapidxml_print.hpp b/thirdpt/rapidxml-1.13/rapidxml_print.hpp new file mode 100644 index 0000000..0ae2b14 --- /dev/null +++ b/thirdpt/rapidxml-1.13/rapidxml_print.hpp @@ -0,0 +1,421 @@ +#ifndef RAPIDXML_PRINT_HPP_INCLUDED +#define RAPIDXML_PRINT_HPP_INCLUDED + +// Copyright (C) 2006, 2009 Marcin Kalicinski +// Version 1.13 +// Revision $DateTime: 2009/05/13 01:46:17 $ +//! \file rapidxml_print.hpp This file contains rapidxml printer implementation + +#include "rapidxml.hpp" + +// Only include streams if not disabled +#ifndef RAPIDXML_NO_STREAMS + #include + #include +#endif + +namespace rapidxml +{ + + /////////////////////////////////////////////////////////////////////// + // Printing flags + + const int print_no_indenting = 0x1; //!< Printer flag instructing the printer to suppress indenting of XML. See print() function. + + /////////////////////////////////////////////////////////////////////// + // Internal + + //! 
\cond internal + namespace internal + { + + /////////////////////////////////////////////////////////////////////////// + // Internal character operations + + // Copy characters from given range to given output iterator + template + inline OutIt copy_chars(const Ch *begin, const Ch *end, OutIt out) + { + while (begin != end) + *out++ = *begin++; + return out; + } + + // Copy characters from given range to given output iterator and expand + // characters into references (< > ' " &) + template + inline OutIt copy_and_expand_chars(const Ch *begin, const Ch *end, Ch noexpand, OutIt out) + { + while (begin != end) + { + if (*begin == noexpand) + { + *out++ = *begin; // No expansion, copy character + } + else + { + switch (*begin) + { + case Ch('<'): + *out++ = Ch('&'); *out++ = Ch('l'); *out++ = Ch('t'); *out++ = Ch(';'); + break; + case Ch('>'): + *out++ = Ch('&'); *out++ = Ch('g'); *out++ = Ch('t'); *out++ = Ch(';'); + break; + case Ch('\''): + *out++ = Ch('&'); *out++ = Ch('a'); *out++ = Ch('p'); *out++ = Ch('o'); *out++ = Ch('s'); *out++ = Ch(';'); + break; + case Ch('"'): + *out++ = Ch('&'); *out++ = Ch('q'); *out++ = Ch('u'); *out++ = Ch('o'); *out++ = Ch('t'); *out++ = Ch(';'); + break; + case Ch('&'): + *out++ = Ch('&'); *out++ = Ch('a'); *out++ = Ch('m'); *out++ = Ch('p'); *out++ = Ch(';'); + break; + default: + *out++ = *begin; // No expansion, copy character + } + } + ++begin; // Step to next character + } + return out; + } + + // Fill given output iterator with repetitions of the same character + template + inline OutIt fill_chars(OutIt out, int n, Ch ch) + { + for (int i = 0; i < n; ++i) + *out++ = ch; + return out; + } + + // Find character + template + inline bool find_char(const Ch *begin, const Ch *end) + { + while (begin != end) + if (*begin++ == ch) + return true; + return false; + } + + /////////////////////////////////////////////////////////////////////////// + // Internal printing operations + + // Print node + template + inline OutIt print_node(OutIt out, const xml_node *node, int flags, int indent) + { + // Print proper node type + switch (node->type()) + { + + // Document + case node_document: + out = print_children(out, node, flags, indent); + break; + + // Element + case node_element: + out = print_element_node(out, node, flags, indent); + break; + + // Data + case node_data: + out = print_data_node(out, node, flags, indent); + break; + + // CDATA + case node_cdata: + out = print_cdata_node(out, node, flags, indent); + break; + + // Declaration + case node_declaration: + out = print_declaration_node(out, node, flags, indent); + break; + + // Comment + case node_comment: + out = print_comment_node(out, node, flags, indent); + break; + + // Doctype + case node_doctype: + out = print_doctype_node(out, node, flags, indent); + break; + + // Pi + case node_pi: + out = print_pi_node(out, node, flags, indent); + break; + + // Unknown + default: + assert(0); + break; + } + + // If indenting not disabled, add line break after node + if (!(flags & print_no_indenting)) + *out = Ch('\n'), ++out; + + // Return modified iterator + return out; + } + + // Print children of the node + template + inline OutIt print_children(OutIt out, const xml_node *node, int flags, int indent) + { + for (xml_node *child = node->first_node(); child; child = child->next_sibling()) + out = print_node(out, child, flags, indent); + return out; + } + + // Print attributes of the node + template + inline OutIt print_attributes(OutIt out, const xml_node *node, int flags) + { + for (xml_attribute *attribute = 
node->first_attribute(); attribute; attribute = attribute->next_attribute()) + { + if (attribute->name() && attribute->value()) + { + // Print attribute name + *out = Ch(' '), ++out; + out = copy_chars(attribute->name(), attribute->name() + attribute->name_size(), out); + *out = Ch('='), ++out; + // Print attribute value using appropriate quote type + if (find_char(attribute->value(), attribute->value() + attribute->value_size())) + { + *out = Ch('\''), ++out; + out = copy_and_expand_chars(attribute->value(), attribute->value() + attribute->value_size(), Ch('"'), out); + *out = Ch('\''), ++out; + } + else + { + *out = Ch('"'), ++out; + out = copy_and_expand_chars(attribute->value(), attribute->value() + attribute->value_size(), Ch('\''), out); + *out = Ch('"'), ++out; + } + } + } + return out; + } + + // Print data node + template + inline OutIt print_data_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_data); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + out = copy_and_expand_chars(node->value(), node->value() + node->value_size(), Ch(0), out); + return out; + } + + // Print data node + template + inline OutIt print_cdata_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_cdata); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'); ++out; + *out = Ch('!'); ++out; + *out = Ch('['); ++out; + *out = Ch('C'); ++out; + *out = Ch('D'); ++out; + *out = Ch('A'); ++out; + *out = Ch('T'); ++out; + *out = Ch('A'); ++out; + *out = Ch('['); ++out; + out = copy_chars(node->value(), node->value() + node->value_size(), out); + *out = Ch(']'); ++out; + *out = Ch(']'); ++out; + *out = Ch('>'); ++out; + return out; + } + + // Print element node + template + inline OutIt print_element_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_element); + + // Print element name and attributes, if any + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + out = copy_chars(node->name(), node->name() + node->name_size(), out); + out = print_attributes(out, node, flags); + + // If node is childless + if (node->value_size() == 0 && !node->first_node()) + { + // Print childless node tag ending + *out = Ch('/'), ++out; + *out = Ch('>'), ++out; + } + else + { + // Print normal node tag ending + *out = Ch('>'), ++out; + + // Test if node contains a single data node only (and no other nodes) + xml_node *child = node->first_node(); + if (!child) + { + // If node has no children, only print its value without indenting + out = copy_and_expand_chars(node->value(), node->value() + node->value_size(), Ch(0), out); + } + else if (child->next_sibling() == 0 && child->type() == node_data) + { + // If node has a sole data child, only print its value without indenting + out = copy_and_expand_chars(child->value(), child->value() + child->value_size(), Ch(0), out); + } + else + { + // Print all children with full indenting + if (!(flags & print_no_indenting)) + *out = Ch('\n'), ++out; + out = print_children(out, node, flags, indent + 1); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + } + + // Print node end + *out = Ch('<'), ++out; + *out = Ch('/'), ++out; + out = copy_chars(node->name(), node->name() + node->name_size(), out); + *out = Ch('>'), ++out; + } + return out; + } + + // Print declaration node + template + inline OutIt 
print_declaration_node(OutIt out, const xml_node *node, int flags, int indent) + { + // Print declaration start + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + *out = Ch('?'), ++out; + *out = Ch('x'), ++out; + *out = Ch('m'), ++out; + *out = Ch('l'), ++out; + + // Print attributes + out = print_attributes(out, node, flags); + + // Print declaration end + *out = Ch('?'), ++out; + *out = Ch('>'), ++out; + + return out; + } + + // Print comment node + template + inline OutIt print_comment_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_comment); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + *out = Ch('!'), ++out; + *out = Ch('-'), ++out; + *out = Ch('-'), ++out; + out = copy_chars(node->value(), node->value() + node->value_size(), out); + *out = Ch('-'), ++out; + *out = Ch('-'), ++out; + *out = Ch('>'), ++out; + return out; + } + + // Print doctype node + template + inline OutIt print_doctype_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_doctype); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + *out = Ch('!'), ++out; + *out = Ch('D'), ++out; + *out = Ch('O'), ++out; + *out = Ch('C'), ++out; + *out = Ch('T'), ++out; + *out = Ch('Y'), ++out; + *out = Ch('P'), ++out; + *out = Ch('E'), ++out; + *out = Ch(' '), ++out; + out = copy_chars(node->value(), node->value() + node->value_size(), out); + *out = Ch('>'), ++out; + return out; + } + + // Print pi node + template + inline OutIt print_pi_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_pi); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + *out = Ch('?'), ++out; + out = copy_chars(node->name(), node->name() + node->name_size(), out); + *out = Ch(' '), ++out; + out = copy_chars(node->value(), node->value() + node->value_size(), out); + *out = Ch('?'), ++out; + *out = Ch('>'), ++out; + return out; + } + + } + //! \endcond + + /////////////////////////////////////////////////////////////////////////// + // Printing + + //! Prints XML to given output iterator. + //! \param out Output iterator to print to. + //! \param node Node to be printed. Pass xml_document to print entire document. + //! \param flags Flags controlling how XML is printed. + //! \return Output iterator pointing to position immediately after last character of printed text. + template + inline OutIt print(OutIt out, const xml_node &node, int flags = 0) + { + return internal::print_node(out, &node, flags, 0); + } + +#ifndef RAPIDXML_NO_STREAMS + + //! Prints XML to given output stream. + //! \param out Output stream to print to. + //! \param node Node to be printed. Pass xml_document to print entire document. + //! \param flags Flags controlling how XML is printed. + //! \return Output stream. + template + inline std::basic_ostream &print(std::basic_ostream &out, const xml_node &node, int flags = 0) + { + print(std::ostream_iterator(out), node, flags); + return out; + } + + //! Prints formatted XML to given output stream. Uses default printing flags. Use print() function to customize printing process. + //! \param out Output stream to print to. + //! \param node Node to be printed. + //! \return Output stream. 
+ template + inline std::basic_ostream &operator <<(std::basic_ostream &out, const xml_node &node) + { + return print(out, node); + } + +#endif + +} + +#endif diff --git a/thirdpt/rapidxml-1.13/rapidxml_utils.hpp b/thirdpt/rapidxml-1.13/rapidxml_utils.hpp new file mode 100644 index 0000000..37c2953 --- /dev/null +++ b/thirdpt/rapidxml-1.13/rapidxml_utils.hpp @@ -0,0 +1,122 @@ +#ifndef RAPIDXML_UTILS_HPP_INCLUDED +#define RAPIDXML_UTILS_HPP_INCLUDED + +// Copyright (C) 2006, 2009 Marcin Kalicinski +// Version 1.13 +// Revision $DateTime: 2009/05/13 01:46:17 $ +//! \file rapidxml_utils.hpp This file contains high-level rapidxml utilities that can be useful +//! in certain simple scenarios. They should probably not be used if maximizing performance is the main objective. + +#include "rapidxml.hpp" +#include +#include +#include +#include + +namespace rapidxml +{ + + //! Represents data loaded from a file + template + class file + { + + public: + + //! Loads file into the memory. Data will be automatically destroyed by the destructor. + //! \param filename Filename to load. + file(const char *filename) + { + using namespace std; + + // Open stream + basic_ifstream stream(filename, ios::binary); + if (!stream) + throw runtime_error(string("cannot open file ") + filename); + stream.unsetf(ios::skipws); + + // Determine stream size + stream.seekg(0, ios::end); + size_t size = stream.tellg(); + stream.seekg(0); + + // Load data and add terminating 0 + m_data.resize(size + 1); + stream.read(&m_data.front(), static_cast(size)); + m_data[size] = 0; + } + + //! Loads file into the memory. Data will be automatically destroyed by the destructor + //! \param stream Stream to load from + file(std::basic_istream &stream) + { + using namespace std; + + // Load data and add terminating 0 + stream.unsetf(ios::skipws); + m_data.assign(istreambuf_iterator(stream), istreambuf_iterator()); + if (stream.fail() || stream.bad()) + throw runtime_error("error reading stream"); + m_data.push_back(0); + } + + //! Gets file data. + //! \return Pointer to data of file. + Ch *data() + { + return &m_data.front(); + } + + //! Gets file data. + //! \return Pointer to data of file. + const Ch *data() const + { + return &m_data.front(); + } + + //! Gets file data size. + //! \return Size of file data, in characters. + std::size_t size() const + { + return m_data.size(); + } + + private: + + std::vector m_data; // File data + + }; + + //! Counts children of node. Time complexity is O(n). + //! \return Number of children of node + template + inline std::size_t count_children(xml_node *node) + { + xml_node *child = node->first_node(); + std::size_t count = 0; + while (child) + { + ++count; + child = child->next_sibling(); + } + return count; + } + + //! Counts attributes of node. Time complexity is O(n). + //! \return Number of attributes of node + template + inline std::size_t count_attributes(xml_node *node) + { + xml_attribute *attr = node->first_attribute(); + std::size_t count = 0; + while (attr) + { + ++count; + attr = attr->next_attribute(); + } + return count; + } + +} + +#endif
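
As a quick orientation for these vendored headers: rapidxml 1.13 is bundled under thirdpt/, presumably so the raytracer examples can read examples/workloads/raytracer_scene.xml. The sketch below is a minimal, hypothetical illustration of the usual pattern with this library (rapidxml::file to load a zero-terminated buffer, xml_document::parse to build the DOM in place, first_node/next_sibling/first_attribute to walk it). The attribute name "name", the default file path, and the include paths are placeholder assumptions for illustration; the real schema and build flags live in the GSParLib sources.

// Hypothetical usage sketch of the vendored rapidxml 1.13 headers.
// The "name" attribute and the default path are placeholders, not taken
// from the actual raytracer_scene.xml schema.
#include <cstdio>
#include <stdexcept>
#include "thirdpt/rapidxml-1.13/rapidxml.hpp"
#include "thirdpt/rapidxml-1.13/rapidxml_utils.hpp"

int main(int argc, char *argv[]) {
    const char *path = (argc > 1) ? argv[1] : "examples/workloads/raytracer_scene.xml";
    try {
        rapidxml::file<> xml_file(path);      // loads the whole file and appends a zero terminator
        rapidxml::xml_document<> doc;
        doc.parse<0>(xml_file.data());        // parse() modifies the buffer in place; it must outlive doc

        rapidxml::xml_node<> *root = doc.first_node();   // first (root) element, whatever its name
        if (!root) {
            std::fprintf(stderr, "empty document\n");
            return 1;
        }
        std::printf("root element: %s\n", root->name());

        // Walk the root's children and print any "name" attribute they carry.
        for (rapidxml::xml_node<> *child = root->first_node(); child; child = child->next_sibling()) {
            rapidxml::xml_attribute<> *name = child->first_attribute("name");
            std::printf("  <%s> name=\"%s\"\n", child->name(), name ? name->value() : "(none)");
        }
    } catch (const rapidxml::parse_error &e) {
        std::fprintf(stderr, "XML parse error: %s\n", e.what());
        return 1;
    } catch (const std::runtime_error &e) {   // rapidxml::file throws std::runtime_error on I/O failure
        std::fprintf(stderr, "%s\n", e.what());
        return 1;
    }
    return 0;
}

The same xml_document/xml_node calls apply if the buffer is loaded by other means, as long as it is zero-terminated and persists while the document is in use; rapidxml_print.hpp can then serialize the tree back out via rapidxml::print.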