From f4db66967b4d20ff57c8fa8d19f3af0a22bc8a8e Mon Sep 17 00:00:00 2001 From: "Dinei A. Rockenbach" Date: Fri, 19 May 2023 20:48:15 +0000 Subject: [PATCH] GSParLib source and examples --- .github/workflows/gspar-build-ci.yml | 26 + .gitignore | 40 + LICENSE | 21 + Makefile | 151 + README.md | 24 +- examples/driver_api/atomic_operations.cpp | 113 + examples/driver_api/gpuinfo.cpp | 93 + examples/driver_api/matrix_multi.cpp | 143 + examples/driver_api/shared_memory.cpp | 126 + examples/driver_api/vector_sum.cpp | 135 + .../driver_api/vector_sum_chunked_memory.cpp | 152 + examples/include/cudabase.hpp | 49 + examples/include/cudabase_driver.hpp | 56 + examples/include/cudabase_nvrtc.hpp | 57 + examples/include/oclbase.h | 116 + examples/include/oclbase.hpp | 116 + examples/pattern_api/mandel.cpp | 141 + .../pattern_api/mandel_batched_parameters.cpp | 222 ++ examples/pattern_api/mandel_stream.cpp | 218 ++ examples/pattern_api/matrix_multi_map_cm.cpp | 133 + examples/pattern_api/matrix_multi_map_rm.cpp | 132 + examples/pattern_api/raytracer.cpp | 818 ++++++ examples/pattern_api/reduce_sample.cpp | 55 + examples/pattern_api/vector_sum_map.cpp | 107 + examples/pattern_api/vector_sum_map_batch.cpp | 117 + .../vector_sum_map_managing_memory.cpp | 103 + .../pattern_api/vector_sum_map_parallel.cpp | 195 ++ examples/pattern_api/vector_sum_mapreduce.cpp | 135 + examples/sequential/mandel.cpp | 100 + examples/sequential/matrix_multi_cm.cpp | 101 + examples/sequential/matrix_multi_rm.cpp | 95 + examples/sequential/primer.cpp | 79 + examples/sequential/raytracer.cpp | 524 ++++ examples/sequential/reduce.cpp | 79 + examples/sequential/saxpy.cpp | 96 + examples/sequential/vector_sum.cpp | 95 + examples/workloads/raytracer_scene.xml | 137 + src/GSPar.hpp | 13 + src/GSPar_Base.cpp | 28 + src/GSPar_Base.hpp | 37 + src/GSPar_BaseGPUDriver.hpp | 796 +++++ src/GSPar_BaseParallelPattern.hpp | 1129 +++++++ src/GSPar_CUDA.cpp | 942 ++++++ src/GSPar_CUDA.hpp | 262 ++ src/GSPar_OpenCL.cpp | 1051 +++++++ src/GSPar_OpenCL.hpp | 260 ++ src/GSPar_PatternComposition.hpp | 271 ++ src/GSPar_PatternMap.hpp | 29 + src/GSPar_PatternReduce.cpp | 155 + src/GSPar_PatternReduce.hpp | 174 ++ thirdpt/marX2/marX2.c | 434 +++ thirdpt/marX2/marX2.h | 29 + thirdpt/rapidxml-1.13/license.txt | 52 + thirdpt/rapidxml-1.13/manual.html | 406 +++ thirdpt/rapidxml-1.13/rapidxml.hpp | 2596 +++++++++++++++++ thirdpt/rapidxml-1.13/rapidxml_iterators.hpp | 174 ++ thirdpt/rapidxml-1.13/rapidxml_print.hpp | 421 +++ thirdpt/rapidxml-1.13/rapidxml_utils.hpp | 122 + 58 files changed, 14480 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/gspar-build-ci.yml create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 examples/driver_api/atomic_operations.cpp create mode 100644 examples/driver_api/gpuinfo.cpp create mode 100644 examples/driver_api/matrix_multi.cpp create mode 100644 examples/driver_api/shared_memory.cpp create mode 100644 examples/driver_api/vector_sum.cpp create mode 100644 examples/driver_api/vector_sum_chunked_memory.cpp create mode 100644 examples/include/cudabase.hpp create mode 100644 examples/include/cudabase_driver.hpp create mode 100644 examples/include/cudabase_nvrtc.hpp create mode 100644 examples/include/oclbase.h create mode 100644 examples/include/oclbase.hpp create mode 100644 examples/pattern_api/mandel.cpp create mode 100644 examples/pattern_api/mandel_batched_parameters.cpp create mode 100644 examples/pattern_api/mandel_stream.cpp create mode 100644 
examples/pattern_api/matrix_multi_map_cm.cpp create mode 100644 examples/pattern_api/matrix_multi_map_rm.cpp create mode 100644 examples/pattern_api/raytracer.cpp create mode 100644 examples/pattern_api/reduce_sample.cpp create mode 100644 examples/pattern_api/vector_sum_map.cpp create mode 100644 examples/pattern_api/vector_sum_map_batch.cpp create mode 100644 examples/pattern_api/vector_sum_map_managing_memory.cpp create mode 100644 examples/pattern_api/vector_sum_map_parallel.cpp create mode 100644 examples/pattern_api/vector_sum_mapreduce.cpp create mode 100644 examples/sequential/mandel.cpp create mode 100644 examples/sequential/matrix_multi_cm.cpp create mode 100644 examples/sequential/matrix_multi_rm.cpp create mode 100644 examples/sequential/primer.cpp create mode 100644 examples/sequential/raytracer.cpp create mode 100644 examples/sequential/reduce.cpp create mode 100644 examples/sequential/saxpy.cpp create mode 100644 examples/sequential/vector_sum.cpp create mode 100644 examples/workloads/raytracer_scene.xml create mode 100644 src/GSPar.hpp create mode 100644 src/GSPar_Base.cpp create mode 100644 src/GSPar_Base.hpp create mode 100644 src/GSPar_BaseGPUDriver.hpp create mode 100644 src/GSPar_BaseParallelPattern.hpp create mode 100644 src/GSPar_CUDA.cpp create mode 100644 src/GSPar_CUDA.hpp create mode 100644 src/GSPar_OpenCL.cpp create mode 100644 src/GSPar_OpenCL.hpp create mode 100644 src/GSPar_PatternComposition.hpp create mode 100644 src/GSPar_PatternMap.hpp create mode 100644 src/GSPar_PatternReduce.cpp create mode 100644 src/GSPar_PatternReduce.hpp create mode 100644 thirdpt/marX2/marX2.c create mode 100644 thirdpt/marX2/marX2.h create mode 100644 thirdpt/rapidxml-1.13/license.txt create mode 100644 thirdpt/rapidxml-1.13/manual.html create mode 100644 thirdpt/rapidxml-1.13/rapidxml.hpp create mode 100644 thirdpt/rapidxml-1.13/rapidxml_iterators.hpp create mode 100644 thirdpt/rapidxml-1.13/rapidxml_print.hpp create mode 100644 thirdpt/rapidxml-1.13/rapidxml_utils.hpp diff --git a/.github/workflows/gspar-build-ci.yml b/.github/workflows/gspar-build-ci.yml new file mode 100644 index 0000000..0d3fb6a --- /dev/null +++ b/.github/workflows/gspar-build-ci.yml @@ -0,0 +1,26 @@ +name: GSPar Build CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v2 + - name: Fix apt on GitHub Actions + run: sudo gem install apt-spy2 && sudo apt-spy2 fix --commit --launchpad --country=US + - name: Update apt + run: sudo apt-get update + - name: Install OpenCL + run: sudo apt-get -o Acquire::Retries=3 install opencl-headers nvidia-opencl-dev #nvidia-libopencl1-384 + - name: Install CUDA + run: sudo apt-get -o Acquire::Retries=3 install nvidia-cuda-dev + - name: Build library + run: make diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b8106b1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,40 @@ +# VS Code +.vscode/ipch/* + +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Ignore the build and lib dirs +build +lib/* + +# Executables +bin/* + +# Auto-generated +generated_*.cpp diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0a03df8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 
Parallel Applications Modelling Group - GMAP + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f64422b --- /dev/null +++ b/Makefile @@ -0,0 +1,151 @@ +# Compilers +COMPILER := g++ +# Directories +SRCDIR := src +BUILDDIR := build +TARGETDIR := bin +EXAMPLESDIR := examples +EXAMPLEDRIVERAPIDIR := $(EXAMPLESDIR)/driver_api +EXAMPLEPATTERNAPIDIR := $(EXAMPLESDIR)/pattern_api +EXAMPLESEQUENTIALDIR := $(EXAMPLESDIR)/sequential +THIRDPTDIR := thirdpt +MARX2DIR := $(THIRDPTDIR)/marX2 +LIBMARX2PATH := $(MARX2DIR)/libmarX2.a +# Names +LIBNAME := gspar +GSPARNAME := gspar +CUDANAME := cuda +OCLNAME := opencl +DRIVERAPINAME := driverapi +PATTERNAPINAME := patternapi +SEQUENTIALNAME := seq +# App names +MANDELNAME := mandel +LANEDETECTIONNAME := lanedetection +# Target +TARGET := $(TARGETDIR)/lib$(LIBNAME).so +# Others +SPACE := +SPACE += +SRCEXT := cpp +EXAMPLESTARGETPREFIX := ex + +# Files +SOURCES := $(wildcard $(SRCDIR)/*.$(SRCEXT)) +OBJECTS := $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(SOURCES:.$(SRCEXT)=.o)) + +CFLAGS := -Wall -std=c++14 -O3 -Wno-reorder +LIB := -Llib -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/targets/x86_64-linux/lib/stubs -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 +LIBOCL := -lOpenCL +LIBCUDADRIVER := -lcuda +LIBCUDANVRTC := -lnvrtc +LIBPTHREAD := -pthread +PATHSLIB := -I/usr/local/cuda/include -Isrc +PATHSTEST := $(PATHSLIB) -I$(THIRDPTDIR) -I$(EXAMPLESDIR)/include +TESTLIB := -L$(TARGETDIR) -l$(LIBNAME) + +PATHOPENCV := -I/usr/local/include/opencv4 +LIBSOPENCV := -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs +EXTRADEPS := +INCMANDEL := +LIBMANDEL := +ifdef DEBUG + DEFS +=-DDEBUG -DGSPAR_DEBUG + EXTRADEPS := $(LIBMARX2PATH) + INCMANDEL := -I$(MARX2DIR) -L$(MARX2DIR) + LIBMANDEL := -lmarX2 -lX11 -lm +endif + +CLR_BLUE := \033[0;34m +CLR_ORANGE := \033[0;33m +CLR_DARKCYAN := \033[0;36m +CLR_NO := \033[0m + +# Functions +get_paths_mandel = $(if $(findstring $(MANDELNAME), $(1)), $(INCMANDEL)) +get_paths_lanedetection = $(if $(findstring $(LANEDETECTIONNAME), $(1)), $(PATHOPENCV)) +get_paths = $(strip $(PATHSTEST) $(LIB) $(call get_paths_mandel, $(1)) $(call get_paths_lanedetection, $(1)) ) + +get_libs_mandel = $(if $(findstring $(MANDELNAME), $(1)), $(LIBMANDEL)) +get_libs_lanedetection = $(if $(findstring $(LANEDETECTIONNAME), $(1)), $(LIBSOPENCV)) +get_libs = $(strip $(call get_libs_mandel, $(1)) $(call get_libs_lanedetection, $(1))) + +# Driver API examples +EXAMPLESOURCES_DRIVERAPI := $(wildcard 
$(EXAMPLEDRIVERAPIDIR)/*.$(SRCEXT)) +EXAMPLETARGETS_DRIVERAPI_CUDA := $(patsubst $(EXAMPLEDRIVERAPIDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%,$(EXAMPLESOURCES_DRIVERAPI:.$(SRCEXT)=_$(CUDANAME))) +EXAMPLETARGETS_DRIVERAPI_OPENCL := $(patsubst $(EXAMPLEDRIVERAPIDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%,$(EXAMPLESOURCES_DRIVERAPI:.$(SRCEXT)=_$(OCLNAME))) +# Pattern API examples +EXAMPLESOURCES_PATTERNAPI := $(wildcard $(EXAMPLEPATTERNAPIDIR)/*.$(SRCEXT)) +EXAMPLETARGETS_PATTERNAPI_CUDA := $(patsubst $(EXAMPLEPATTERNAPIDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%,$(EXAMPLESOURCES_PATTERNAPI:.$(SRCEXT)=_$(CUDANAME))) +EXAMPLETARGETS_PATTERNAPI_OPENCL := $(patsubst $(EXAMPLEPATTERNAPIDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%,$(EXAMPLESOURCES_PATTERNAPI:.$(SRCEXT)=_$(OCLNAME))) +# Sequential examples +EXAMPLESOURCES_SEQUENTIAL := $(wildcard $(EXAMPLESEQUENTIALDIR)/*.$(SRCEXT)) +EXAMPLETARGETS_SEQUENTIAL := $(patsubst $(EXAMPLESEQUENTIALDIR)/%,$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(SEQUENTIALNAME)_%,$(EXAMPLESOURCES_SEQUENTIAL:.$(SRCEXT)=)) + + +# Build targets + +$(TARGET): $(OBJECTS) | $(TARGETDIR) + @echo "${CLR_DARKCYAN}Linking dynamic library ${CLR_ORANGE}$(TARGET)${CLR_NO}..." + $(COMPILER) $(DEFS) -shared -fPIC -o $(TARGET) $^ $(LIB) $(LIBOCL) $(LIBCUDADRIVER) $(LIBCUDANVRTC) + +$(BUILDDIR)/%.o: $(SRCDIR)/%.$(SRCEXT) | $(BUILDDIR) + @echo "${CLR_DARKCYAN}Compiling and assembling object ${CLR_ORANGE}$@${CLR_NO}..." + $(COMPILER) $(DEFS) $(CFLAGS) $(PATHSLIB) -c -fPIC -o $@ $< + +$(TARGETDIR): + @mkdir -p $@ + +$(BUILDDIR): + @mkdir -p $@ + + +.PHONY: examples +examples: examples_driver_api examples_pattern_api examples_sequential + +# Driver API examples +examples_driver_api: $(EXAMPLETARGETS_DRIVERAPI_CUDA) $(EXAMPLETARGETS_DRIVERAPI_OPENCL) +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%: $(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%_$(CUDANAME) $(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%_$(OCLNAME) ; +# Lib to CUDA +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%_$(CUDANAME): $(EXAMPLEDRIVERAPIDIR)/%.$(SRCEXT) $(TARGET) $(EXTRADEPS) | $(TARGETDIR) + @echo "${CLR_DARKCYAN}Building GSPar Driver API example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}" + $(COMPILER) $(DEFS) -DGSPARDRIVER_CUDA $(CFLAGS) $< $(call get_paths, $<) $(TESTLIB) -o $@ $(LIBPTHREAD) $(call get_libs, $<) +# Lib to OpenCL +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(DRIVERAPINAME)_%_$(OCLNAME): $(EXAMPLEDRIVERAPIDIR)/%.$(SRCEXT) $(TARGET) $(EXTRADEPS) | $(TARGETDIR) + @echo "${CLR_DARKCYAN}Building GSPar Driver API example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}" + $(COMPILER) $(DEFS) -DGSPARDRIVER_OPENCL $(CFLAGS) $< $(call get_paths, $<) $(TESTLIB) -o $@ $(LIBPTHREAD) $(call get_libs, $<) + +# Pattern API examples +examples_pattern_api: $(EXAMPLETARGETS_PATTERNAPI_CUDA) $(EXAMPLETARGETS_PATTERNAPI_OPENCL) +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%: $(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%_$(CUDANAME) $(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%_$(OCLNAME) ; +# Lib to CUDA +$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%_$(CUDANAME): $(EXAMPLEPATTERNAPIDIR)/%.$(SRCEXT) $(TARGET) $(EXTRADEPS) | $(TARGETDIR) + @echo "${CLR_DARKCYAN}Building GSPar Pattern API example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}" + $(COMPILER) $(DEFS) -DGSPARDRIVER_CUDA $(CFLAGS) $< $(call get_paths, $<) $(TESTLIB) -o $@ $(LIBPTHREAD) $(call get_libs, $<) +# 
Lib to OpenCL
+$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(PATTERNAPINAME)_%_$(OCLNAME): $(EXAMPLEPATTERNAPIDIR)/%.$(SRCEXT) $(TARGET) $(EXTRADEPS) | $(TARGETDIR)
+	@echo "${CLR_DARKCYAN}Building GSPar Pattern API example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}"
+	$(COMPILER) $(DEFS) -DGSPARDRIVER_OPENCL $(CFLAGS) $< $(call get_paths, $<) $(TESTLIB) -o $@ $(LIBPTHREAD) $(call get_libs, $<)
+
+# Sequential examples
+examples_sequential: $(EXAMPLETARGETS_SEQUENTIAL)
+$(TARGETDIR)/$(EXAMPLESTARGETPREFIX)_$(SEQUENTIALNAME)_%: $(EXAMPLESEQUENTIALDIR)/%.$(SRCEXT) | $(TARGETDIR)
+	@echo "${CLR_DARKCYAN}Building sequential example ${CLR_ORANGE}$@${CLR_DARKCYAN} from $<${CLR_NO}"
+	$(COMPILER) $(DEFS) $(CFLAGS) $< $(call get_paths, $<) -o $@ $(call get_libs, $<)
+
+$(LIBMARX2PATH): $(MARX2DIR)/marX2.c $(MARX2DIR)/marX2.h
+	@echo "${CLR_DARKCYAN}Building ${CLR_ORANGE}$(LIBMARX2PATH)${CLR_DARKCYAN}${CLR_NO}"
+	gcc -c -Wall -O3 -I/usr/X11R6/include -I$(MARX2DIR) $(MARX2DIR)/marX2.c -o $(MARX2DIR)/marX2.o
+	ar -rv $(LIBMARX2PATH) $(MARX2DIR)/marX2.o
+	ranlib $(LIBMARX2PATH)
+
+.PHONY: clean
+clean:
+	@echo "${CLR_DARKCYAN}Cleaning...${CLR_NO}";
+	$(RM) -r $(BUILDDIR) $(TARGETDIR) $(MARX2DIR)/*.a $(MARX2DIR)/*.o
+
+clean_lib:
+	@echo "${CLR_DARKCYAN}Cleaning lib $(TARGET)...${CLR_NO}";
+	$(RM) $(OBJECTS) $(TARGET)
+
+all: $(TARGET) examples
diff --git a/README.md b/README.md
index 73de64e..bcd1726 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,23 @@
-# GSParLib
\ No newline at end of file
+# GSParLib
+
+GSParLib is a C++ object-oriented multi-level API for GPU programming that allows code portability between different GPU platforms and targets stream and data parallelism.
+
+The scientific article presenting GSParLib is currently under review.
+
+## Compilation
+
+- `make` builds the library
+- `make examples` builds the examples for both the Pattern and Driver APIs, as well as the sequential versions. To compile just a specific set of examples, use one of:
+  - `make examples_driver_api`
+  - `make examples_pattern_api`
+  - `make examples_sequential`
+- Alternatively, it is possible to compile individual examples by referring directly to their compiled names (the `cuda`/`opencl` suffix may be omitted). Ex.: `make bin/ex_driverapi_gpuinfo` compiles both CUDA and OpenCL versions of the [gpuinfo.cpp](examples/driver_api/gpuinfo.cpp) example.
+
+To compile with debugging enabled, use `DEBUG=1 make` (both when compiling the library and the examples). This automatically enables debugging flags, so that GSParLib prints various debugging information during execution.
+
+## Run examples
+
+After building the library, it is necessary to make it available at runtime.
+To do this, execute `export LD_LIBRARY_PATH=<repo_root>/bin:$LD_LIBRARY_PATH`, replacing `<repo_root>` with the path to the repository's root folder.
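+For instance, assuming the repository was cloned to `$HOME/GSParLib` (a hypothetical location used here only for illustration), the export could look like this:
+
+```sh
+# Hypothetical clone path; adjust to wherever the repository actually lives.
+export LD_LIBRARY_PATH=$HOME/GSParLib/bin:$LD_LIBRARY_PATH
+```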
+ +After this, just execute any example under the path `bin/ex_` diff --git a/examples/driver_api/atomic_operations.cpp b/examples/driver_api/atomic_operations.cpp new file mode 100644 index 0000000..679cd77 --- /dev/null +++ b/examples/driver_api/atomic_operations.cpp @@ -0,0 +1,113 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +const char* kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_KERNEL void atomicops_kernel(const int max, + GSPAR_DEVICE_GLOBAL_MEMORY const int *vector, + GSPAR_DEVICE_GLOBAL_MEMORY int *result) { + size_t gid = gspar_get_global_id(0); + if (gid <= max) { + gspar_atomic_add_int(result, vector[gid]); + } + } +); + +void print_vector(int size, const int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." << vector[size-1]; + } else { + for (int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + + std::cout << "Testing atomic operations in GSParLib Driver API" << std::endl; + + const int VECTOR_SIZE = 20; + + // Create memory objects + int correctResult = 0; + int* result = new int; + int* vector = new int[VECTOR_SIZE]; + for (int i = 0; i < VECTOR_SIZE; i++) { + vector[i] = (int)i; + correctResult += i; + } + + std::cout << "Vector with " << VECTOR_SIZE << " elements:" << std::endl; + print_vector(VECTOR_SIZE, vector); + + try { + + auto t_start = std::chrono::steady_clock::now(); + + auto driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + // Get the first GPU + auto gpu = driver->getGpu(0); + + auto vector_dev = gpu->malloc(sizeof(int) * VECTOR_SIZE, vector); + // Async copy + // vector_dev->copyInAsync(); + // vector_dev->waitAsync(); + // Sync copy + vector_dev->copyIn(); + + auto result_dev = gpu->malloc(sizeof(int), result); + + auto kernel = gpu->prepareKernel(kernelSource, "atomicops_kernel"); + // auto kernel = new Kernel(gpu, kernelSource, "atomicops_kernel"); + + // Set a fixed number of threads per block for the X dimension + // kernel->setNumThreadsPerBlockForX(5); + kernel->setParameter(sizeof(VECTOR_SIZE), &VECTOR_SIZE); + kernel->setParameter(vector_dev); + kernel->setParameter(result_dev); + + kernel->runAsync({VECTOR_SIZE, 0}); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete vector_dev; + delete result_dev; + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Expected result: " << correctResult << std::endl; + std::cout << "Actual result: " << *result << std::endl; + + delete vector; + delete result; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +} diff --git a/examples/driver_api/gpuinfo.cpp b/examples/driver_api/gpuinfo.cpp new file mode 100644 index 0000000..184f860 --- /dev/null +++ b/examples/driver_api/gpuinfo.cpp @@ -0,0 +1,93 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + + const char* nameOfGSParDriver = "OpenCL"; + + #include "GSPar_OpenCL.hpp" + using namespace 
GSPar::Driver::OpenCL; + +#else + + const char* nameOfGSParDriver = "CUDA"; + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +#endif + +const char* kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_MACRO_BEGIN CONSTANT_N 42 GSPAR_DEVICE_MACRO_END + GSPAR_DEVICE_KERNEL void info_kernel(int N) { + unsigned int idx_x = gspar_get_global_id(0); + unsigned int idx_y = gspar_get_global_id(1); + unsigned int blk_x = gspar_get_block_size(0); + unsigned int blk_y = gspar_get_block_size(1); + unsigned int blkid_x = gspar_get_block_id(0); + unsigned int blkid_y = gspar_get_block_id(1); + unsigned int thr_x = gspar_get_thread_id(0); + unsigned int thr_y = gspar_get_thread_id(1); + gspar_synchronize_local_threads(); // Unnecessary, just for show + printf("Thread [%u,%u]: Dim (%u, %u), Block (%u, %u), thread (%u, %u), constant N: %d, parameter N: %d\n", + idx_x, idx_y, blk_x, blk_y, blkid_x, blkid_y, thr_x, thr_y, CONSTANT_N, N); + } +); + +int main(int argc, const char * argv[]) { + + std::cout << "Testing GSPar Driver: " << nameOfGSParDriver << std::endl; + + try { + + auto t_start = std::chrono::steady_clock::now(); + + Instance* driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + auto gpus = driver->getGpuList(); + + std::cout << "Found " << numGpus << " GPU devices:" << std::endl; + int d = 0; + for (auto const& gpu : gpus) { + std::cout << "Device #" << ++d << ": \"" << gpu->getName() << "\""; + std::cout << " (" << (gpu->isIntegratedMainMemory() ? "integrated" : "dedicated") << ")" << std::endl; + std::cout << " Memory:" << std::endl; + std::cout << " Total global memory: " << gpu->getGlobalMemorySizeBytes()/(1024 * 1024) << " MB" << std::endl; + std::cout << " Total local memory: " << gpu->getLocalMemorySizeBytes()/1024 << " KB" << std::endl; + std::cout << " Total shared memory per CU: " << gpu->getSharedMemoryPerComputeUnitSizeBytes()/1024 << " KB" << std::endl; + std::cout << " Number of compute units (CU): " << gpu->getComputeUnitsCount() << std::endl; + std::cout << " Maximum threads per block: " << gpu->getMaxThreadsPerBlock() << std::endl; + std::cout << " Device clock rate: " << gpu->getClockRateMHz() << " MHz" << std::endl; + } + + auto gpu = gpus.front(); + std::cout << "Running test kernel in the first GPU (" << gpu->getName() << ")" << std::endl; + + auto kernel = gpu->prepareKernel(kernelSource, "info_kernel"); + // auto kernel = new Kernel(gpu, kernelSource, "info_kernel"); + + int N = 12; + kernel->setParameter(sizeof(N), &N); + + kernel->runAsync({2, 3}); + // kernel->waitAsync(); + + delete kernel; + + auto t_end = std::chrono::steady_clock::now(); + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + } + + return 0; +} diff --git a/examples/driver_api/matrix_multi.cpp b/examples/driver_api/matrix_multi.cpp new file mode 100644 index 0000000..aff2bc5 --- /dev/null +++ b/examples/driver_api/matrix_multi.cpp @@ -0,0 +1,143 @@ +#include +#include + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#endif + +const char* 
kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_KERNEL void matrix_multi(long MX, + GSPAR_DEVICE_GLOBAL_MEMORY const long *a, + GSPAR_DEVICE_GLOBAL_MEMORY const long *b, + GSPAR_DEVICE_GLOBAL_MEMORY long *result) { + long i = gspar_get_global_id(0); + long j = gspar_get_global_id(1); + if (i < MX && j < MX) { + for (long k = 0; kinit(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cerr << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + auto gpus = driver->getGpuList(); + + // Get the first GPU + Device* gpu = gpus.front(); + MemoryObject* a_dev = gpu->malloc(sizeof(long) * max * max, a); + MemoryObject* b_dev = gpu->malloc(sizeof(long) * max * max, b); + // Async copy + // a_dev->copyInAsync(); + // b_dev->copyInAsync(); + // AsyncExecutionSupport::waitAllAsync({ a_dev->getBaseAsyncObject(), b_dev->getBaseAsyncObject() }); + // Sync copy + a_dev->copyIn(); + b_dev->copyIn(); + + MemoryObject* result_dev = gpu->malloc(sizeof(long) * max * max, result); + result_dev->copyIn(); + + // Kernel* kernel = gpu->prepareKernel(kernelSource, "matrix_multi"); + Kernel* kernel = new Kernel(gpu, kernelSource, "matrix_multi"); + + kernel->setParameter(sizeof(max), &max); + kernel->setParameter(a_dev); + kernel->setParameter(b_dev); + kernel->setParameter(result_dev); + + unsigned long dimensions[3] = {(unsigned long)max, (unsigned long)max, 0}; + kernel->runAsync(dimensions); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete a_dev; + delete b_dev; + delete result_dev; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_matrix(long max, const long* matrix, bool compact = false) { + if (compact || max > 100) { + std::cout << matrix[0] << "..." 
<< matrix[(max * max)-1]; + } else { + for (long i = 0; i < max; i++) { + std::cout << std::endl; + for (long j = 0; j < max; j++) { + std::cout << matrix[i * max + j] << " "; + } + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const long MX = std::stoi(argv[1]); + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + + // Create memory objects + long* matrix_a = new long[MX * MX]; + long* matrix_b = new long[MX * MX]; + long* result = new long[MX * MX]; + for (long i = 0; i < MX; i++) { + for (long j = 0; j < MX; j++) { + matrix_a[j * MX + i] = 4; + matrix_b[j * MX + i] = 5; + result[j * MX + i] = 0; + } + } + + std::cout << "Matrix A: "; + print_matrix(MX, matrix_a, true); + std::cout << "Matrix B: "; + print_matrix(MX, matrix_b, true); + + auto t_start = std::chrono::steady_clock::now(); + + matrix_multi(MX, matrix_a, matrix_b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_matrix(MX, result); + + delete matrix_a; + delete matrix_b; + delete result; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/driver_api/shared_memory.cpp b/examples/driver_api/shared_memory.cpp new file mode 100644 index 0000000..2f434e5 --- /dev/null +++ b/examples/driver_api/shared_memory.cpp @@ -0,0 +1,126 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + + std::cout << "Testing shared memory in GSParLib Driver API" << std::endl; + + std::string kernelSource = "" + "GSPAR_DEVICE_KERNEL void sharedmem_kernel(const int max, \n" + " GSPAR_DEVICE_GLOBAL_MEMORY const unsigned int *vector, \n" + " GSPAR_DEVICE_GLOBAL_MEMORY unsigned int *result"; + #ifdef GSPARDRIVER_OPENCL // OpenCL requires declaring shared memory after all the parameters + kernelSource += ", GSPAR_DEVICE_SHARED_MEMORY unsigned int* sharedMem) { \n"; + #else // CUDA requires declaring shared memory inside kernel's body + kernelSource += ") { \n GSPAR_DEVICE_SHARED_MEMORY unsigned int sharedMem[];\n"; + #endif + kernelSource += + " size_t gid = gspar_get_global_id(0); \n" + " if (gid <= max) { \n" + " sharedMem[gid] = vector[gid]; \n" + " } \n" + " gspar_synchronize_local_threads(); \n" + " if (gid == 0) { \n" + " for (size_t i = 0; i < max; i++) { \n" + " *result += sharedMem[i]; \n" + " } \n" + " } \n" + "} \n"; + + const unsigned int VECTOR_SIZE = 20; + + // Create memory objects + unsigned int correctResult = 0; + unsigned int* result = new unsigned int; + unsigned int* vector = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + vector[i] = (unsigned int)i; + correctResult += i; + } + + std::cout << "Vector with " << VECTOR_SIZE << " elements:" << std::endl; + print_vector(VECTOR_SIZE, vector); + + try { + + auto t_start = std::chrono::steady_clock::now(); + + auto driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + // Get the first GPU + auto gpu = driver->getGpu(0); + + auto vector_dev = gpu->malloc(sizeof(unsigned int) * VECTOR_SIZE, vector); + // Async copy + // vector_dev->copyInAsync(); + // vector_dev->waitAsync(); + // Sync copy + vector_dev->copyIn(); + + auto result_dev = gpu->malloc(sizeof(unsigned int), result); + + auto kernel = gpu->prepareKernel(kernelSource, "sharedmem_kernel"); + // auto kernel = new Kernel(gpu, kernelSource, "sharedmem_kernel"); + + kernel->setSharedMemoryAllocation(sizeof(unsigned int) * VECTOR_SIZE); + + // Set a fixed number of threads per block for the X dimension + // kernel->setNumThreadsPerBlockForX(5); + kernel->setParameter(sizeof(VECTOR_SIZE), &VECTOR_SIZE); + kernel->setParameter(vector_dev); + kernel->setParameter(result_dev); + + kernel->runAsync({VECTOR_SIZE, 0}); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete vector_dev; + delete result_dev; + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Expected result: " << correctResult << std::endl; + std::cout << "Actual result: " << *result << std::endl; + + delete vector; + delete result; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +} diff --git a/examples/driver_api/vector_sum.cpp b/examples/driver_api/vector_sum.cpp new file mode 100644 index 0000000..17f5b6a --- /dev/null +++ b/examples/driver_api/vector_sum.cpp @@ -0,0 +1,135 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + + #include 
"GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#else + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +#endif + +const char* kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_KERNEL void vector_sum_kernel(const int max, + GSPAR_DEVICE_GLOBAL_MEMORY const unsigned int *a, + GSPAR_DEVICE_GLOBAL_MEMORY const unsigned int *b, + GSPAR_DEVICE_GLOBAL_MEMORY unsigned int *result) { + size_t gid = gspar_get_global_id(0); + if (gid <= max) { + result[gid] = a[gid] + b[gid]; + } + } +); + +void vector_sum(const unsigned int max, const unsigned int* a, const unsigned int* b, unsigned int* result) { + + try { + + auto driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + // Get the first GPU + auto gpu = driver->getGpu(0); + + // MemoryObject* a_dev = new MemoryObject(gpu, sizeof(unsigned int) * max, a); + auto a_dev = gpu->malloc(sizeof(unsigned int) * max, a); + auto b_dev = gpu->malloc(sizeof(unsigned int) * max, b); + // Async copy + a_dev->copyInAsync(); + b_dev->copyInAsync(); + AsyncExecutionSupport::waitAllAsync({ a_dev, b_dev }); + // Sync copy + // a_dev->copyIn(); + // b_dev->copyIn(); + + auto result_dev = gpu->malloc(sizeof(unsigned int) * max, result); + + // auto kernel = new Kernel(gpu, kernelSource, "vector_sum_kernel"); + auto kernel = gpu->prepareKernel(kernelSource, "vector_sum_kernel"); + + // Set a fixed number of threads per block for the X dimension + kernel->setNumThreadsPerBlockForX(5); + kernel->setParameter(sizeof(max), &max); + kernel->setParameter(a_dev); + kernel->setParameter(b_dev); + kernel->setParameter(result_dev); + + kernel->runAsync({max, 0}); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete a_dev; + delete b_dev; + delete result_dev; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + + // Create memory objects + unsigned int* result = new unsigned int[VECTOR_SIZE]; + unsigned int* a = new unsigned int[VECTOR_SIZE]; + unsigned int* b = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + a[i] = (unsigned int)i; + b[i] = (unsigned int)i + 1; + result[i] = 0; + } + + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(VECTOR_SIZE, a, b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); + + delete result; + delete a; + delete b; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/driver_api/vector_sum_chunked_memory.cpp b/examples/driver_api/vector_sum_chunked_memory.cpp new file mode 100644 index 0000000..05f474d --- /dev/null +++ b/examples/driver_api/vector_sum_chunked_memory.cpp @@ -0,0 +1,152 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#else + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +#endif + +const char* kernelSource = GSPAR_STRINGIZE_SOURCE( + GSPAR_DEVICE_KERNEL void vector_sum(const int max, + GSPAR_DEVICE_GLOBAL_MEMORY const float *a, + GSPAR_DEVICE_GLOBAL_MEMORY const float *b, + GSPAR_DEVICE_GLOBAL_MEMORY float *result) { + size_t gid = gspar_get_global_id(0); + if (gid <= max) { + result[gid] = a[gid] + b[gid]; + } + } +); + +void vector_sum(const unsigned int max, const unsigned int chunks, const float* a, const float* b, float* result) { + + try { + + Instance* driver = Instance::getInstance(); + driver->init(); + + int numGpus = driver->getGpuCount(); + if (numGpus == 0) { + std::cout << "No GPU found, interrupting test" << std::endl; + exit(-1); + } + + // Get the first GPU + auto gpu = driver->getGpu(0); + + // Separate memory in chunks to simulate real-world chunked data + const unsigned int itemsInEachChunk = max/chunks; + + const void** a_chunked = new const void*[chunks]; + const void** b_chunked = new const void*[chunks]; + for (unsigned int chunk = 0; chunk < chunks; chunk++) { + a_chunked[chunk] = &a[chunk*itemsInEachChunk]; + b_chunked[chunk] = &b[chunk*itemsInEachChunk]; + // std::cout << "a_chunked[" << chunk << "] starts on " << ((float*)a_chunked[chunk])[0] << std::endl; + // std::cout << "b_chunked[" << chunk << "] starts on " << ((float*)b_chunked[chunk])[0] << std::endl; + } + + ChunkedMemoryObject* a_dev = gpu->mallocChunked(chunks, sizeof(float) * itemsInEachChunk, a_chunked); + ChunkedMemoryObject* b_dev = gpu->mallocChunked(chunks, sizeof(float) * itemsInEachChunk, b_chunked); + + // Async copy + a_dev->copyInAsync(); + b_dev->copyInAsync(); + AsyncExecutionSupport::waitAllAsync({ a_dev, b_dev }); + // Sync copy + // a_dev->copyIn(); + // b_dev->copyIn(); + + MemoryObject* result_dev = gpu->malloc(sizeof(float) * max, result); + + // Kernel* kernel = 
gpu->prepareKernel(kernelSource, "vector_sum"); + Kernel* kernel = new Kernel(gpu, kernelSource, "vector_sum"); + + kernel->setParameter(sizeof(max), &max); + kernel->setParameter(a_dev); + kernel->setParameter(b_dev); + kernel->setParameter(result_dev); + + unsigned long dimensions[3] = {max, 0, 0}; + kernel->runAsync(dimensions); + kernel->waitAsync(); + + result_dev->copyOut(); + + delete kernel; + delete a_dev; + delete b_dev; + delete result_dev; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const float* vector, unsigned int itemsInEachChunk = 0, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." << vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + if (itemsInEachChunk && ((i+1) % itemsInEachChunk == 0)) std::cout << "| "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 3) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + std::cerr << " should be divisible by " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + const unsigned int CHUNKS = std::stoi(argv[2]); + + const unsigned int itemsInEachChunk = VECTOR_SIZE/CHUNKS; + + + // Create memory objects + float* result = new float[VECTOR_SIZE]; + float* a = new float[VECTOR_SIZE]; + float* b = new float[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + a[i] = (float)i; + b[i] = (float)(i * 2); + result[i] = 0; + } + + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a, itemsInEachChunk); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b, itemsInEachChunk); + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(VECTOR_SIZE, CHUNKS, a, b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); + + delete result; + delete a; + delete b; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/include/cudabase.hpp b/examples/include/cudabase.hpp new file mode 100644 index 0000000..3782235 --- /dev/null +++ b/examples/include/cudabase.hpp @@ -0,0 +1,49 @@ + +#ifndef __CUDABASE_INCLUDED__ +#define __CUDABASE_INCLUDED__ + + +#define CUDA_ERROR_CHECK + +#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ ) +#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) + +inline void __cudaSafeCall( cudaError err, const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + if ( cudaSuccess != err ) + { + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %d-%s\n", + file, line, err, cudaGetErrorString( err ) ); + exit( -1 ); + } +#endif + + return; +} + +inline void __cudaCheckError( const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + cudaError err = cudaGetLastError(); + if ( cudaSuccess != err ) + { + fprintf( stderr, "cudaCheckError() failed at %s:%i : %d-%s\n", + file, line, err, cudaGetErrorString( err ) ); + exit( -1 ); + } + + // More careful checking. However, this will affect performance. + // Comment away if needed. 
+ err = cudaDeviceSynchronize(); + if( cudaSuccess != err ) + { + fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %d-%s\n", + file, line, err, cudaGetErrorString( err ) ); + exit( -1 ); + } +#endif + + return; +} +#endif \ No newline at end of file diff --git a/examples/include/cudabase_driver.hpp b/examples/include/cudabase_driver.hpp new file mode 100644 index 0000000..2b412cf --- /dev/null +++ b/examples/include/cudabase_driver.hpp @@ -0,0 +1,56 @@ + +#ifndef __CUDABASE_INCLUDED__ +#define __CUDABASE_INCLUDED__ + +#include +#include + +#define CUDA_ERROR_CHECK + +#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ ) +// #define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) + +inline void __cudaSafeCall( CUresult err, const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + if ( CUDA_SUCCESS != err ) + { + const char* errName; + cuGetErrorName(err, &errName); + const char* errString; + cuGetErrorString(err, &errString); + + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s - %s\n", + file, line, errName, errString ); + exit( -1 ); + } +#endif + + return; +} + +// inline void __cudaCheckError( const char *file, const int line ) +// { +// #ifdef CUDA_ERROR_CHECK +// CUresult err = cudaGetLastError(); +// if ( cudaSuccess != err ) +// { +// fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n", +// file, line, cudaGetErrorString( err ) ); +// exit( -1 ); +// } + +// // More careful checking. However, this will affect performance. +// // Comment away if needed. +// err = cudaDeviceSynchronize(); +// if( cudaSuccess != err ) +// { +// fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", +// file, line, cudaGetErrorString( err ) ); +// exit( -1 ); +// } +// #endif + +// return; +// } +#endif \ No newline at end of file diff --git a/examples/include/cudabase_nvrtc.hpp b/examples/include/cudabase_nvrtc.hpp new file mode 100644 index 0000000..deb85d1 --- /dev/null +++ b/examples/include/cudabase_nvrtc.hpp @@ -0,0 +1,57 @@ + +#ifndef __CUDABASENVRTC_INCLUDED__ +#define __CUDABASENVRTC_INCLUDED__ + +#include +#include + +#define CUDA_ERROR_CHECK + +#define NvrtcSafeCall( err ) __nvrtcSafeCall( err, __FILE__, __LINE__ ) +#define NvrtcSafeBuild( err, prog ) __nvrtcSafeBuild( prog, err, __FILE__, __LINE__ ) + +inline void __nvrtcSafeCall( nvrtcResult err, const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + if ( NVRTC_SUCCESS != err ) + { + const char* errString = nvrtcGetErrorString(err); + + fprintf( stderr, "nvrtcSafeCall() failed at %s:%i : %s\n", + file, line, errString ); + exit( -1 ); + } +#endif + + return; +} + +inline void __nvrtcSafeBuild( nvrtcProgram prog, nvrtcResult err, const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + if ( NVRTC_SUCCESS != err ) + { + const char* errString = nvrtcGetErrorString(err); + + fprintf( stderr, "nvrtcSafeBuild() failed at %s:%i : %s\n", + file, line, errString ); + + size_t logSize = 0; + nvrtcGetProgramLogSize(prog, &logSize); + + if (logSize > 0) { + char *buildLog = new char[logSize]; + nvrtcGetProgramLog(prog, buildLog); + + fprintf( stderr, "Build log:\n%s", buildLog); + delete[] buildLog; + } + + exit( -1 ); + } +#endif + + return; +} + +#endif \ No newline at end of file diff --git a/examples/include/oclbase.h b/examples/include/oclbase.h new file mode 100644 index 0000000..2a7d757 --- /dev/null +++ b/examples/include/oclbase.h @@ -0,0 +1,116 @@ + +#ifndef __OCLBASE_INCLUDED__ +#define __OCLBASE_INCLUDED__ + +#include +#include + +#define 
OCL_ERROR_CHECK + +const char *__openCLGetErrorString(cl_int error) +{ +switch(error){ + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } +} + +#define OpenCLCheckError(status) __openCLCheckError( status, __FILE__, __LINE__ ) +#define OpenCLCheckBuildError(status, program, device) __openCLPrintBuildError( status, program, device, __FILE__, __LINE__ ) + +inline void __openCLCheckError( cl_int status, const char 
*file, const int line ) +{ +#ifdef OCL_ERROR_CHECK + if (status != CL_SUCCESS) { + printf("OpenCL failed at %s:%i : %s (%d)\n", file, line, __openCLGetErrorString(status), status); + exit( -1 ); + } +#endif + return; +} + +inline void __openCLPrintBuildError(cl_int status, cl_program program, cl_device_id device, const char *file, const int line) +{ + if (status == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = (char *) malloc(log_size); + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("Program build log:\n"); + printf("%s\n", log); + free(log); + } + + OpenCLCheckError(status); +} + +#endif diff --git a/examples/include/oclbase.hpp b/examples/include/oclbase.hpp new file mode 100644 index 0000000..4e30632 --- /dev/null +++ b/examples/include/oclbase.hpp @@ -0,0 +1,116 @@ + +#ifndef __OCLBASE_INCLUDED__ +#define __OCLBASE_INCLUDED__ + +#include +#include + +#define OCL_ERROR_CHECK + +const char *__openCLGetErrorString(cl_int error) +{ +switch(error){ + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + 
case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } +} + +#define OpenCLCheckError(status) __openCLCheckError( status, __FILE__, __LINE__ ) +#define OpenCLCheckBuildError(status, program, device) __openCLPrintBuildError( status, program, device, __FILE__, __LINE__ ) + +inline void __openCLCheckError( cl_int status, const char *file, const int line ) +{ +#ifdef OCL_ERROR_CHECK + if (status != CL_SUCCESS) { + printf("OpenCL failed at %s:%i : %s (%d)\n", file, line, __openCLGetErrorString(status), status); + exit( -1 ); + } +#endif + return; +} + +inline void __openCLPrintBuildError(cl_int status, cl_program program, cl_device_id device, const char *file, const int line) +{ + if (status == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = (char *) malloc(log_size); + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("Program build log:\n"); + printf("%s\n", log); + free(log); + } + + OpenCLCheckError(status); +} + +#endif diff --git a/examples/pattern_api/mandel.cpp b/examples/pattern_api/mandel.cpp new file mode 100644 index 0000000..c578277 --- /dev/null +++ b/examples/pattern_api/mandel.cpp @@ -0,0 +1,141 @@ +#include +#include +#ifdef DEBUG +#include "marX2/marX2.h" +#endif + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +void mandelbrot(const double init_a, const double init_b, const double range, const unsigned long dim, const unsigned long niter, unsigned char *M) { + double step = range/((double) dim); + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + double im=init_b+(step*i); + double cr; + double a=cr=init_a+step*j; + double b=im; + unsigned long k = 0; + for (k = 0; k < niter; k++) { + double a2=a*a; + double b2=b*b; + if ((a2+b2)>4.0) break; + b=2*a*b+im; + a=a2-b2+cr; + } + M[i*dim+j] = (unsigned char)(255-((k*255/niter))); + )); + + try { + + pattern->setParameter("init_a", init_a) + .setParameter("init_b", init_b) + .setParameter("step", step) + .setParameter("dim", dim) + .setParameter("niter", niter) + .setParameter("M", dim*dim, M, GSPAR_PARAM_OUT); + + pattern->setStdVarNames({"i", "j", ""}); + + pattern->compile({dim, dim, 0}); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + + 
tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + try { + pattern->run(); + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + + delete pattern; +} + +int main(int argc, char **argv) { + double init_a=-2.125,init_b=-1.5,range=3.0; + unsigned long dim = 1000; + unsigned long niter = 1000; + + #ifndef DEBUG + if (argc<3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + exit(-1); + } + #endif + if (argc > 1) { + dim = strtoul(argv[1], 0, 10); + } + if (argc > 2) { + niter = strtoul(argv[2], 0, 10); + } + + unsigned char *M = new unsigned char[dim*dim]; + + #ifdef DEBUG + SetupXWindows(dim,dim,1,NULL,"Mandelbroot"); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + mandelbrot(init_a, init_b, range, dim, niter, M); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + #ifdef DEBUG + for(unsigned long i=0; i(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Teste: " << argv[0] << " " << dim << " " << niter << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << argv[0] << " " << dim << " " << niter << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + #ifdef DEBUG + getchar(); + CloseXWindows(); + #endif + + delete[] M; + return 0; +} diff --git a/examples/pattern_api/mandel_batched_parameters.cpp b/examples/pattern_api/mandel_batched_parameters.cpp new file mode 100644 index 0000000..1950079 --- /dev/null +++ b/examples/pattern_api/mandel_batched_parameters.cpp @@ -0,0 +1,222 @@ +/* *************************************************************************** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU General Public License. 
This exception does not however + * invalidate any other reasons why the executable file might be covered by + * the GNU General Public License. + * + **************************************************************************** + */ + +/* + + Author: Marco Aldinucci. + email: aldinuc@di.unipi.it + marco@pisa.quadrics.com + date : 15/11/97 + +Modified by: + +**************************************************************************** + * Author: Dalvan Griebler + * Author: Dinei Rockenbach + * + * Copyright: GNU General Public License + * Description: This program simply computes the mandelbroat set. + * File Name: mandel.cpp + * Version: 1.0 (25/05/2018) + * Compilation Command: make + **************************************************************************** +*/ + + +#include +#ifdef DEBUG +#include "marX2.h" +#endif +#include +#include + +#include +#include + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +#define DIM 800 +#define ITERATION 1024 + +double diffmsec(struct timeval a, struct timeval b) { + long sec = (a.tv_sec - b.tv_sec); + long usec = (a.tv_usec - b.tv_usec); + + if(usec < 0) { + --sec; + usec += 1000000; + } + return ((double)(sec*1000)+ (double)usec/1000.0); +} + + + +int main(int argc, char **argv) { + double init_a=-2.125,init_b=-1.5,range=3.0; + unsigned long dim = DIM, niter = ITERATION; + // stats + struct timeval t1,t2; + int retries=1; + double avg = 0; + int batch_size = 1; + + if (argc<5) { + printf("Usage: %s size niterations retries batch_size\n\n", argv[0]); + exit(-1); + } + else { + dim = atoi(argv[1]); + niter = atoi(argv[2]); + retries = atoi(argv[3]); + batch_size = atoi(argv[4]); + } + + double * runs = new double[retries]; + unsigned char **Ms = new unsigned char*[batch_size]; + for (int b = 0; b < batch_size; b++) { + Ms[b] = new unsigned char[dim]; + } + + unsigned int batches = ceil((double)dim/batch_size); + + double step = range/((double) dim); + +#ifdef DEBUG + SetupXWindows(dim,dim,1,NULL,"Sequential Mandelbroot"); +#endif + + printf("bin;size;numiter;time (ms);workers;batch size\n"); + for (int r=0; r4.0) break; + b=2*a*b+im; + a=a2-b2+cr; + } + M[j]= (unsigned char) 255-((k*255/niter)); + )); + + unsigned long dimensions[3] = {dim, 0, 0}; + + try { + + pattern->setParameterPlaceholder("i", GSPAR_PARAM_VALUE, GSPAR_PARAM_IN, true) + .setParameter("dim", dim) + .setParameter("init_a", init_a) + .setParameter("init_b", init_b) + .setParameter("step", step) + .setParameter("niter", niter) + .setParameterPlaceholder("M", GSPAR_PARAM_POINTER, GSPAR_PARAM_INOUT, true); + + pattern->setStdVarNames({"j", "", ""}) + .setBatchSize(batch_size); + + pattern->compile(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + // Start time + gettimeofday(&t1,NULL); + + int* Is = new int[batch_size]; + + for(unsigned int b=0; bsetBatchedParameter("i", Is) + .setBatchedParameter("M", dim, Ms, GSPAR_PARAM_INOUT); + + pattern->run(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +#ifdef DEBUG + for (int i = 0; i < batch_size; i++) { + ShowLine(Ms[i],dim,Is[i]); + } +#endif + } + // Stop 
time + gettimeofday(&t2,NULL); + + avg += runs[r] = diffmsec(t2,t1); + printf("%s;%lu;%lu;%.2f;1;1\n", argv[0], dim, niter, runs[r]); + } + avg = avg / (double) retries; + double var = 0; + for (int r=0; r + * Author: Dinei Rockenbach + * + * Copyright: GNU General Public License + * Description: This program simply computes the mandelbroat set. + * File Name: mandel.cpp + * Version: 1.0 (25/05/2018) + * Compilation Command: make + **************************************************************************** +*/ + + +#include +#ifdef DEBUG +#include "marX2/marX2.h" +#endif +#include +#include + +#include +#include + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +#define DIM 800 +#define ITERATION 1024 + +double diffmsec(struct timeval a, struct timeval b) { + long sec = (a.tv_sec - b.tv_sec); + long usec = (a.tv_usec - b.tv_usec); + + if(usec < 0) { + --sec; + usec += 1000000; + } + return ((double)(sec*1000)+ (double)usec/1000.0); +} + + + +int main(int argc, char **argv) { + double init_a=-2.125,init_b=-1.5,range=3.0; + unsigned long dim = DIM, niter = ITERATION; + // stats + struct timeval t1,t2; + int retries=1; + double avg = 0; + + + if (argc<4) { + printf("Usage: %s size niterations retries\n\n", argv[0]); + exit(-1); + } + else { + dim = atoi(argv[1]); + niter = atoi(argv[2]); + retries = atoi(argv[3]); + } + + double * runs = new double[retries]; + unsigned char *M = new unsigned char[dim]; + + double step = range/((double) dim); + +#ifdef DEBUG + SetupXWindows(dim,dim,1,NULL,"Sequential Mandelbroot"); +#endif + + printf("bin;size;numiter;time (ms);workers;batch size\n"); + for (int r=0; r4.0) break; + b=2*a*b+im; + a=a2-b2+cr; + } + M[j]= (unsigned char) 255-((k*255/niter)); + )); + + unsigned long dimensions[3] = {dim, 0, 0}; + try { + + pattern->setParameterPlaceholder("i", GSPAR_PARAM_VALUE) + .setParameter("dim", dim) + .setParameter("init_a", init_a) + .setParameter("init_b", init_b) + .setParameter("step", step) + .setParameter("niter", niter) + .setParameterPlaceholder("M", GSPAR_PARAM_POINTER, GSPAR_PARAM_INOUT); + + pattern->setStdVarNames({"j"}); + + pattern->compile(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + // Start time + gettimeofday(&t1,NULL); + + for(unsigned long i=0; i4.0) break; + // b=2*a*b+im; + // a=a2-b2+cr; + // } + // M[j]= (unsigned char) 255-((k*255/niter)); + // } + + try { + pattern->setParameter("i", i) + .setParameter("M", dim, M, GSPAR_PARAM_INOUT); + + pattern->run(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +#ifdef DEBUG + ShowLine(M,dim,i); +#endif + } + // Stop time + gettimeofday(&t2,NULL); + + avg += runs[r] = diffmsec(t2,t1); + printf("%s;%lu;%lu;%.2f;1;1\n", argv[0], dim, niter, runs[r]); + } + avg = avg / (double) retries; + double var = 0; + for (int r=0; r +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + 
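  // Every example selects its GPU backend with this preprocessor switch:
  // building with GSPARDRIVER_CUDA defined picks the CUDA driver header and
  // namespace, otherwise (or with GSPARDRIVER_OPENCL) the OpenCL ones are used.
  // A hedged sketch of how the macro is typically supplied on the compiler
  // command line (the project's actual Makefile targets are not shown here,
  // and the usual GSParLib/CUDA/OpenCL include and link flags are elided):
  //
  //   g++ matrix_multi_map_cm.cpp -DGSPARDRIVER_CUDA   ...   // CUDA backend
  //   g++ matrix_multi_map_cm.cpp -DGSPARDRIVER_OPENCL ...   // OpenCL backend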
namespace Driver = GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + namespace Driver = GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +namespace Pattern = GSPar::Pattern; + +void multiply(const unsigned long size, const float *matrixA, const float *matrixB, float *result) { + try { + + auto map = new Pattern::Map(GSPAR_STRINGIZE_SOURCE( + float sum = 0; + for (unsigned long k = 0; k < size; k++) { + sum += a[k * size + i] * b[j * size + k]; // Column-major + } + result[j * size + i] = sum; + )); + + map->setStdVarNames({"i", "j"}); + + map->setParameter("size", size) + .setParameter("a", sizeof(float) * size * size, matrixA) + .setParameter("b", sizeof(float) * size * size, matrixB) + .setParameter("result", sizeof(float) * size * size, result, Pattern::GSPAR_PARAM_OUT); + + map->compile({size, size, 0}); + + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + map->run(); + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + + delete map; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void printMatrix(const unsigned long size, float *matrix, bool compact = false) { + if (compact || size > 100) { + std::cout << matrix[0] << ".." << matrix[size-1] << ".." << matrix[size*size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << std::endl; + for (unsigned long j = 0; j < size; j++) { + // std::cout << matrix[i * size + j] << " "; // Row-major + std::cout << matrix[j * size + i] << " "; // Column-major + } + } + } + std::cout << std::endl; +} + +int main(int argc, char const *argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long MX = std::stoi(argv[1]); + + float *matrixA = new float[MX * MX]; + float *matrixB = new float[MX * MX]; + float *result = new float[MX * MX]; + for (unsigned long i = 0; i < MX; i++) { + for (unsigned long j = 0; j < MX; j++) { + // Column-major + matrixA[j * MX + i] = i+1; + matrixB[j * MX + i] = j+1; + result[j * MX + i] = 0; + } + } + + #ifdef DEBUG + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + std::cout << "Matrix A: "; + printMatrix(MX, matrixA); + std::cout << "Matrix B: "; + printMatrix(MX, matrixB); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + multiply(MX, matrixA, matrixB, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Result: "; + printMatrix(MX, result); + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << result[0] << ".." << result[MX-1] << ".." 
<< result[MX*MX-1] << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + delete[] matrixA; + delete[] matrixB; + delete[] result; +} diff --git a/examples/pattern_api/matrix_multi_map_rm.cpp b/examples/pattern_api/matrix_multi_map_rm.cpp new file mode 100644 index 0000000..7b176ae --- /dev/null +++ b/examples/pattern_api/matrix_multi_map_rm.cpp @@ -0,0 +1,132 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + namespace Driver = GSPar::Driver::CUDA; + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + namespace Driver = GSPar::Driver::OpenCL; + +#endif + +#include "GSPar_PatternMap.hpp" +namespace Pattern = GSPar::Pattern; + +void multiply(const unsigned long size, const float *matrixA, const float *matrixB, float *result) { + try { + + auto map = new Pattern::Map(GSPAR_STRINGIZE_SOURCE( + float sum = 0; + for (unsigned long k = 0; k < size; k++) { + sum += a[i * size + k] * b[k * size + j]; // Row-major + } + result[i * size + j] = sum; + )); + + map->setStdVarNames({"i", "j"}); + + map->setParameter("size", size) + .setParameter("a", sizeof(float) * size * size, matrixA) + .setParameter("b", sizeof(float) * size * size, matrixB) + .setParameter("result", sizeof(float) * size * size, result, Pattern::GSPAR_PARAM_OUT); + + map->compile({size, size, 0}); + + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + map->run(); + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + + delete map; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void printMatrix(const unsigned long size, float *matrix, bool compact = false) { + if (compact || size > 100) { + std::cout << matrix[0] << ".." << matrix[size-1] << ".." 
<< matrix[size*size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << std::endl; + for (unsigned long j = 0; j < size; j++) { + std::cout << matrix[i * size + j] << " "; // Row-major + } + } + } + std::cout << std::endl; +} + +int main(int argc, char const *argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long MX = std::stoi(argv[1]); + + float *matrixA = new float[MX * MX]; + float *matrixB = new float[MX * MX]; + float *result = new float[MX * MX]; + for (unsigned long i = 0; i < MX; i++) { + for (unsigned long j = 0; j < MX; j++) { + // Row-major + matrixA[i * MX + j] = i+1; + matrixB[i * MX + j] = j+1; + result[i * MX + j] = 0; + } + } + + #ifdef DEBUG + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + std::cout << "Matrix A: "; + printMatrix(MX, matrixA); + std::cout << "Matrix B: "; + printMatrix(MX, matrixB); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + multiply(MX, matrixA, matrixB, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Result: "; + printMatrix(MX, result); + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << result[0] << ".." << result[MX-1] << ".." << result[MX*MX-1] << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + delete[] matrixA; + delete[] matrixB; + delete[] result; +} diff --git a/examples/pattern_api/raytracer.cpp b/examples/pattern_api/raytracer.cpp new file mode 100644 index 0000000..865a426 --- /dev/null +++ b/examples/pattern_api/raytracer.cpp @@ -0,0 +1,818 @@ +// [header] +// A very basic raytracer example. +// [/header] +// [compile] +// c++ -o raytracer -O3 -Wall raytracer.cpp +// [/compile] +// [ignore] +// Copyright (C) 2012 www.scratchapixel.com +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
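// Note on this GPU port: the device-side Vec3f, Sphere and trace() routines are
// supplied as a string of extra kernel code (a templated C++ variant for the
// CUDA backend, a struct-plus-helper-functions variant for OpenCL) and attached
// to the Map pattern inside raytrace() further below with
//   pattern->addExtraKernelCode(extraKernelCode);
// so the per-pixel map kernel can call trace() on the device, mirroring the
// commented-out CPU ray loop kept in raytrace().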
+// [/ignore] +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "rapidxml-1.13/rapidxml.hpp" + +#ifdef GSPARDRIVER_CUDA + + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; + + const char* extraKernelCode = GSPAR_STRINGIZE_SOURCE( + template + class Vec3 + { + public: + T x, y, z; + Vec3() : x(T(0)), y(T(0)), z(T(0)) {} + Vec3(T xx) : x(xx), y(xx), z(xx) {} + Vec3(T xx, T yy, T zz) : x(xx), y(yy), z(zz) {} + void normalize() { Vec3f_normalize(this); } + Vec3 operator * (const T &f) const { return Vec3(x * f, y * f, z * f); } + Vec3 operator * (const Vec3 &v) const { return Vec3(x * v.x, y * v.y, z * v.z); } + T dot(const Vec3 &v) const { return x * v.x + y * v.y + z * v.z; } + Vec3 operator - (const Vec3 &v) const { return Vec3(x - v.x, y - v.y, z - v.z); } + Vec3 operator + (const Vec3 &v) const { return Vec3(x + v.x, y + v.y, z + v.z); } + Vec3& operator += (const Vec3 &v) { x += v.x, y += v.y, z += v.z; return *this; } + Vec3& operator *= (const Vec3 &v) { x *= v.x, y *= v.y, z *= v.z; return *this; } + Vec3 operator - () const { return Vec3(-x, -y, -z); } + T length2() const { return x * x + y * y + z * z; } + T length() const { return sqrt(length2()); } + }; + + typedef Vec3 Vec3f; + typedef Vec3 Vec3b; + + Vec3f Vec3f_new_single(float xx) { + Vec3f v; + v.x = xx; + v.y = xx; + v.z = xx; + return v; + } + Vec3f Vec3f_new(float xx, float yy, float zz) { + Vec3f v; + v.x = xx; + v.y = yy; + v.z = zz; + return v; + } + Vec3f Vec3f_mult_single(const Vec3f *thes, const float f) { return Vec3f_new(thes->x * f, thes->y * f, thes->z * f); } + Vec3f Vec3f_mult(const Vec3f *thes, const Vec3f* v) { return Vec3f_new(thes->x * v->x, thes->y * v->y, thes->z * v->z); } + float Vec3f_dot(const Vec3f *thes, const Vec3f *v) { return thes->x * v->x + thes->y * v->y + thes->z * v->z; } + Vec3f Vec3f_minus(const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x - v->x, thes->y - v->y, thes->z - v->z); } + Vec3f Vec3f_plus(const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x + v->x, thes->y + v->y, thes->z + v->z); } + Vec3f Vec3f_inverse(const Vec3f *thes) { return Vec3f_new(-thes->x, -thes->y, -thes->z); } + float Vec3f_length2(const Vec3f *thes) { return thes->x * thes->x + thes->y * thes->y + thes->z * thes->z; } + void Vec3f_normalize(Vec3f *thes) { + float nor2 = Vec3f_length2(thes); + if (nor2 > 0) { + float invNor = 1 / sqrt(nor2); + thes->x *= invNor; + thes->y *= invNor; + thes->z *= invNor; + } + } + + class Sphere + { + public: + const char* id; + Vec3f center; /// position of the sphere + float radius, radius2; /// sphere radius and radius^2 + Vec3f surfaceColor, emissionColor; /// surface color and emission (light) + float transparency, reflection; /// surface transparency and reflectivity + int animation_frame; + Vec3b animation_position_rand; + Vec3f animation_position; + Sphere() { } + Sphere( + const char* id, + const Vec3f &c, + const float &r, + const Vec3f &sc, + const float &refl = 0, + const float &transp = 0, + const Vec3f &ec = 0) : + id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc), + emissionColor(ec), transparency(transp), reflection(refl) + { + animation_frame = 0; + } + //[comment] + // Compute a ray-sphere intersection using the geometric solution + //[/comment] + bool intersect(const Vec3f &rayorig, const Vec3f &raydir, float &t0, float &t1) const + { + Vec3f l = center - rayorig; + float tca = l.dot(raydir); + if (tca < 0) return false; + float d2 = 
l.dot(l) - tca * tca; + if (d2 > radius2) return false; + float thc = sqrt(radius2 - d2); + t0 = tca - thc; + t1 = tca + thc; + + return true; + } + }; + + float mixfresnel(const float &a, const float &b, const float &mixval) { + return b * mixval + a * (1 - mixval); + } + + Vec3f trace( + const Vec3f *rayorig_ptr, + const Vec3f *raydir_ptr, + const Sphere *spheres, + const unsigned int spheres_size, + const int &depth) + { + const Vec3f rayorig = *rayorig_ptr; + const Vec3f raydir = *raydir_ptr; + + float tnear = 1e8; + const Sphere* sphere = NULL; + // find intersection of this ray with the sphere in the scene + for (unsigned i = 0; i < spheres_size; ++i) { + float t0 = 1e8, t1 = 1e8; + if (spheres[i].intersect(rayorig, raydir, t0, t1)) { + if (t0 < 0) t0 = t1; + if (t0 < tnear) { + tnear = t0; + sphere = &spheres[i]; + } + } + } + // if there's no intersection return black or background color + if (!sphere) return Vec3f(2); + Vec3f surfaceColor = 0; // color of the ray/surfaceof the object intersected by the ray + Vec3f phit = rayorig + raydir * tnear; // point of intersection + Vec3f nhit = phit - sphere->center; // normal at the intersection point + nhit.normalize(); // normalize normal direction + // If the normal and the view direction are not opposite to each other + // reverse the normal direction. That also means we are inside the sphere so set + // the inside bool to true. Finally reverse the sign of IdotN which we want + // positive. + float bias = 1e-4; // add some bias to the point from which we will be tracing + bool inside = false; + if (raydir.dot(nhit) > 0) nhit = -nhit, inside = true; + if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < 5) { //MAX_RAY_DEPTH + float facingratio = 1+raydir.dot(nhit); + float fresneleffect = facingratio*facingratio*facingratio; + // change the mix value to tweak the effect + fresneleffect = mixfresnel(fresneleffect, 1, 0.1); + // compute reflection direction (not need to normalize because all vectors + // are already normalized) + Vec3f refldir = raydir - nhit * 2 * raydir.dot(nhit); + refldir.normalize(); + Vec3f new_rayorig = phit + nhit * bias; + Vec3f reflection = trace(&new_rayorig, &refldir, spheres, spheres_size, depth + 1); + Vec3f refraction = 0; + // if the sphere is also transparent compute refraction ray (transmission) + if (sphere->transparency) { + float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface? 
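            // The refraction direction computed below follows the standard
            // vector form of Snell's law:
            //   eta  = n1 / n2        (1/ior when entering the sphere, ior when leaving)
            //   cosi = -N . I         (I = raydir, N = nhit)
            //   k    = 1 - eta^2 * (1 - cosi^2)   // cos^2 of the refracted angle
            //   T    = eta * I + (eta * cosi - sqrt(k)) * N
            // A negative k would mean total internal reflection; this example
            // does not special-case it.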
+ float cosi = -nhit.dot(raydir); + float k = 1 - eta * eta * (1 - cosi * cosi); + Vec3f refrdir = raydir * eta + nhit * (eta * cosi - sqrt(k)); + refrdir.normalize(); + new_rayorig = phit - nhit * bias; + refraction = trace(&new_rayorig, &refrdir, spheres, spheres_size, depth + 1); + } + // the result is a mix of reflection and refraction (if the sphere is transparent) + surfaceColor = ( + reflection * fresneleffect + + refraction * (1 - fresneleffect) * sphere->transparency) * sphere->surfaceColor; + } + else { + // it's a diffuse object, no need to raytrace any further + for (unsigned i = 0; i < spheres_size; ++i) { + if (spheres[i].emissionColor.x > 0) { + // this is a light + Vec3f transmission = 1; + Vec3f lightDirection = spheres[i].center - phit; + lightDirection.normalize(); + for (unsigned j = 0; j < spheres_size; ++j) { + if (i != j) { + float t0, t1; + if (spheres[j].intersect(phit + nhit * bias, lightDirection, t0, t1)) { + transmission = 0; + break; + } + } + } + surfaceColor += sphere->surfaceColor * transmission * + max(float(0), nhit.dot(lightDirection)) * spheres[i].emissionColor; + } + } + } + + return surfaceColor + sphere->emissionColor; + } + ); + +// #elif GSPARDRIVER_OPENCL +#else // This way my IDE doesn't complain + + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; + + const char* extraKernelCode = GSPAR_STRINGIZE_SOURCE( + typedef struct tVec3b { bool x; bool y; bool z; } Vec3b; + typedef struct tVec3f { float x; float y; float z; } Vec3f; + Vec3f Vec3f_new_single(float xx) { + Vec3f v; + v.x = xx; + v.y = xx; + v.z = xx; + return v; + } + Vec3f Vec3f_new(float xx, float yy, float zz) { + Vec3f v; + v.x = xx; + v.y = yy; + v.z = zz; + return v; + } + Vec3f Vec3f_mult_single(const Vec3f *thes, const float f) { return Vec3f_new(thes->x * f, thes->y * f, thes->z * f); } + Vec3f Vec3f_mult(const Vec3f *thes, const Vec3f* v) { return Vec3f_new(thes->x * v->x, thes->y * v->y, thes->z * v->z); } + Vec3f Vec3f_mult__global_first(const __global Vec3f *thes, const Vec3f* v) { return Vec3f_new(thes->x * v->x, thes->y * v->y, thes->z * v->z); } + Vec3f Vec3f_mult__global_second(const Vec3f *thes, const __global Vec3f* v) { return Vec3f_new(thes->x * v->x, thes->y * v->y, thes->z * v->z); } + float Vec3f_dot(const Vec3f *thes, const Vec3f *v) { return thes->x * v->x + thes->y * v->y + thes->z * v->z; } + Vec3f Vec3f_minus(const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x - v->x, thes->y - v->y, thes->z - v->z); } + Vec3f Vec3f_minus__global_first(__global const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x - v->x, thes->y - v->y, thes->z - v->z); } + Vec3f Vec3f_minus__global_second(const Vec3f *thes, const __global Vec3f *v) { return Vec3f_new(thes->x - v->x, thes->y - v->y, thes->z - v->z); } + Vec3f Vec3f_plus(const Vec3f *thes, const Vec3f *v) { return Vec3f_new(thes->x + v->x, thes->y + v->y, thes->z + v->z); } + Vec3f Vec3f_plus__global_second(const Vec3f *thes, const __global Vec3f *v) { return Vec3f_new(thes->x + v->x, thes->y + v->y, thes->z + v->z); } + Vec3f Vec3f_inverse(const Vec3f *thes) { return Vec3f_new(-thes->x, -thes->y, -thes->z); } + float Vec3f_length2(const Vec3f *thes) { return thes->x * thes->x + thes->y * thes->y + thes->z * thes->z; } + void Vec3f_normalize(Vec3f *thes) { + float nor2 = Vec3f_length2(thes); + if (nor2 > 0) { + float invNor = 1 / sqrt(nor2); + thes->x *= invNor; + thes->y *= invNor; + thes->z *= invNor; + } + } + + typedef struct tSphere { + const char *id; + Vec3f center; + float radius, 
radius2; + Vec3f surfaceColor, emissionColor; + float transparency, reflection; + int animation_frame; + Vec3b animation_position_rand; + Vec3f animation_position; + } Sphere; + + bool Sphere_intersect(__global const Sphere* thes, const Vec3f *rayorig, const Vec3f *raydir, float *t0, float *t1) { + Vec3f l = Vec3f_minus__global_first(&thes->center, rayorig); + float tca = Vec3f_dot(&l, raydir); + if (tca < 0) return false; + float d2 = Vec3f_dot(&l, &l) - tca * tca; + if (d2 > thes->radius2) return false; + float thc = sqrt(thes->radius2 - d2); + *t0 = tca - thc; + *t1 = tca + thc; + + return true; + } + + float mix_fresnel(const float a, const float b, const float mixval) { + return b * mixval + a * (1 - mixval); + } + + Vec3f trace( + const Vec3f* rayorig, + const Vec3f* raydir, + const __global Sphere *spheres, + const unsigned int spheres_size, + const int depth) + { + float tnear = 1e8; + const __global Sphere* sphere = NULL; + // find intersection of the ray with the sphere in the scene + for (unsigned i = 0; i < spheres_size; ++i) { + float t0 = 1e8, t1 = 1e8; + if (Sphere_intersect(&spheres[i], rayorig, raydir, &t0, &t1)) { + if (t0 < 0) t0 = t1; + if (t0 < tnear) { + tnear = t0; + sphere = &spheres[i]; + } + } + } + + // if there's no intersection return black or background color + if (!sphere) return Vec3f_new_single(2); + Vec3f surfaceColor = Vec3f_new_single(0); // color of the ray/surfaceof the object intersected by the ray + Vec3f aux = Vec3f_mult_single(raydir, tnear); + Vec3f phit = Vec3f_plus(rayorig, &aux); + Vec3f nhit = Vec3f_minus__global_second(&phit, &sphere->center); // normal at the intersection point + Vec3f_normalize(&nhit); // normalize normal direction + // If the normal and the view direction are not opposite to each other + // reverse the normal direction. That also means we are inside the sphere so set + // the inside bool to true. Finally reverse the sign of IdotN which we want + // positive. + float bias = 1e-4; // add some bias to the point from which we will be tracing + bool inside = false; + if (Vec3f_dot(raydir, &nhit) > 0) { + nhit = Vec3f_inverse(&nhit); + inside = true; + } + if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < 5) { // MAX_RAY_DEPTH + float facingratio = 1+Vec3f_dot(raydir, &nhit); + float fresneleffect = facingratio*facingratio*facingratio; + // change the mix value to tweak the effect + fresneleffect = mix_fresnel(fresneleffect, 1, 0.1); + // compute reflection direction (not need to normalize because all vectors + // are already normalized) + aux = Vec3f_mult_single(&nhit, 2); + aux = Vec3f_mult_single(&aux, Vec3f_dot(raydir, &nhit)); + Vec3f refldir = Vec3f_minus(raydir, &aux); + Vec3f_normalize(&refldir); + aux = Vec3f_mult_single(&nhit, bias); + aux = Vec3f_plus(&phit, &aux); + Vec3f reflection = trace(&aux, &refldir, spheres, spheres_size, depth + 1); + Vec3f refraction = Vec3f_new_single(0); + // if the sphere is also transparent compute refraction ray (transmission) + if (sphere->transparency) { + float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface? 
+ float cosi = -Vec3f_dot(&nhit, raydir); + float k = 1 - eta * eta * (1 - cosi * cosi); + aux = Vec3f_mult_single(raydir, eta); + Vec3f aux2 = Vec3f_mult_single(&nhit, (eta * cosi - sqrt(k))); + Vec3f refrdir = Vec3f_plus(&aux, &aux2); + Vec3f_normalize(&refrdir); + aux = Vec3f_mult_single(&nhit, bias); + aux = Vec3f_minus(&phit, &aux); + refraction = trace(&aux, &refrdir, spheres, spheres_size, depth + 1); + } + // the result is a mix of reflection and refraction (if the sphere is transparent) + aux = Vec3f_mult_single(&reflection, fresneleffect); + Vec3f aux2 = Vec3f_mult_single(&refraction, (1 - fresneleffect) * sphere->transparency); + surfaceColor = Vec3f_plus(&aux, &aux2); + surfaceColor = Vec3f_mult__global_second(&surfaceColor, &sphere->surfaceColor); + } + else { + // it's a diffuse object, no need to raytrace any further + for (unsigned i = 0; i < spheres_size; ++i) { + if (spheres[i].emissionColor.x > 0) { + // this is a light + Vec3f transmission = Vec3f_new_single(1); + Vec3f lightDirection = Vec3f_minus__global_first(&spheres[i].center, &phit); + Vec3f_normalize(&lightDirection); + for (unsigned j = 0; j < spheres_size; ++j) { + if (i != j) { + float t0, t1; //Unused + // t0 = 0; + // t1 = 0; + aux = Vec3f_mult_single(&nhit, bias); + aux = Vec3f_plus(&phit, &aux); + if (Sphere_intersect(&spheres[j], &aux, &lightDirection, &t0, &t1)) { + transmission = Vec3f_new_single(0); + break; + } + } + } + + aux = Vec3f_mult__global_first(&sphere->surfaceColor, &transmission); + aux = Vec3f_mult_single(&aux, fmax((float)0, Vec3f_dot(&nhit, &lightDirection))); + aux = Vec3f_mult__global_second(&aux, &spheres[i].emissionColor); + surfaceColor = Vec3f_plus(&surfaceColor, &aux); + } + } + } + + return Vec3f_plus__global_second(&surfaceColor, &sphere->emissionColor); + } + ); + +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +#if defined __linux__ || defined __APPLE__ +// "Compiled for Linux +#else +// Windows doesn't define these values by default, Linux does +#define M_PI 3.141592653589793 +#endif + +// This variable controls if it should work in memory. 
If it is not defined, works in disk +#define WORK_IN_MEMORY + +#ifdef WORK_IN_MEMORY +#define WORKING_MEDIA "memory" +#else +#define WORKING_MEDIA "disk" +#endif + +class Vec3f { +public: + float x, y, z; + Vec3f() : x(0), y(0), z(0) {} + Vec3f(float xx) : x(xx), y(xx), z(xx) {} + Vec3f(float xx, float yy, float zz) : x(xx), y(yy), z(zz) {} +}; +struct Vec3b { + bool x; bool y; bool z; +}; + +class Sphere +{ +public: + const char *id; + Vec3f center; /// position of the sphere + float radius, radius2; /// sphere radius and radius^2 + Vec3f surfaceColor, emissionColor; /// surface color and emission (light) + float transparency, reflection; /// surface transparency and reflectivity + int animation_frame; + Vec3b animation_position_rand; + Vec3f animation_position; + Sphere() { } + Sphere( + const char *id, + const Vec3f &c, + const float &r, + const Vec3f &sc, + const float &refl = 0, + const float &transp = 0, + const Vec3f &ec = 0) : + id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc), + emissionColor(ec), transparency(transp), reflection(refl) + { + animation_frame = 0; + } +}; + +void save_image(const std::string output_folder, const int frame, const unsigned int width, const unsigned int height, Vec3f *image) { + // Save result to a PPM image (keep these flags if you compile under Windows) + std::stringstream ss; + ss << std::setfill('0') << std::setw(5) << frame; + std::string filename = output_folder + "/frame" + ss.str() + ".ppm"; +#ifdef DEBUG + std::cout << "[Work] Writing frame " << frame << " to " << filename << std::endl; +#endif + std::ofstream ofs(filename, std::ios::out | std::ios::binary); + ofs << "P6\n" << width << " " << height << "\n255\n"; + for (unsigned i = 0; i < width * height; ++i) { + ofs << (unsigned char)(std::min(float(1), image[i].x) * 255) << + (unsigned char)(std::min(float(1), image[i].y) * 255) << + (unsigned char)(std::min(float(1), image[i].z) * 255); + } + ofs.close(); +} + + +void raytrace(std::string output_folder, int total_frames, unsigned int width, unsigned int height, const std::vector &initial_spheres) { + float invWidth = 1 / float(width); + float invHeight = 1 / float(height); + float fov = 30; + float aspectratio = width / float(height); + float angle = tan(M_PI * 0.5 * fov / 180.); + + // std::cout << "[Vec3f] CPU version is " << sizeof(Vec3f) << ", gpu version is " << sizeof(GpuVec3f) << std::endl; + // std::cout << "[Sphere] CPU version is " << sizeof(Sphere) << ", gpu version is " << sizeof(GpuSphere) << std::endl; + +#ifdef WORK_IN_MEMORY + unsigned int total_memory = sizeof(Vec3f)*total_frames*width*height; + std::string total_memory_unit = " bytes"; + if (total_memory > 1024) { + total_memory = (total_frames*width*height)/1024; + total_memory_unit = " KB"; + } + if (total_memory > (10*1024)) { + total_memory /= 1024; + total_memory_unit = " MB"; + } +#ifdef DEBUG + std::cout << "[Init] Allocating " << total_memory << total_memory_unit << " of memory to store images" << std::endl; +#endif + Vec3f **images = new Vec3f*[total_frames]; + for (int f=0; fsetParameter("width", width) + .setParameter("invWidth", invWidth) + .setParameter("invHeight", invHeight) + .setParameter("aspectratio", aspectratio) + .setParameter("angle", angle) + .setParameterPlaceholder("image", GSPAR_PARAM_POINTER, GSPAR_PARAM_INOUT) + .setParameterPlaceholder("spheres") + .setParameterPlaceholder("spheres_size", GSPAR_PARAM_VALUE); + + // Extra kernel code + pattern->addExtraKernelCode(extraKernelCode); + + unsigned long dimensions[3] = {width, height, 
0}; + pattern->compile(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + +# ifndef NO_TIME_MEASUREMENT +#ifdef DEBUG + std::cout << "[Time] Starting time measurement" << std::endl; +#endif + time_t wall_start, wall_end; + time(&wall_start); + clock_t cpu_start = clock(); +# endif + + for (int frame = 1; frame <= total_frames; frame++) { +#ifdef DEBUG + std::cout << "[Work] Generating frame " << frame << "..." << std::endl; +#endif + // Set up the scenne + unsigned int spheres_size = initial_spheres.size(); + Sphere* spheres = new Sphere[spheres_size]; + memcpy(spheres, initial_spheres.data(), sizeof(Sphere) * spheres_size); + + // Animation of each frame + for(unsigned long i = 0; i != spheres_size; i++) { + if (spheres[i].animation_frame != 0 && + (spheres[i].animation_frame > 0 && frame < spheres[i].animation_frame)) { + continue; + } + + int adjusted_frame = frame; + if (spheres[i].animation_frame < 0) { + if (frame > spheres[i].animation_frame*-1) { + adjusted_frame = spheres[i].animation_frame*-1; + } + } else if (spheres[i].animation_frame > 0) { + adjusted_frame -= spheres[i].animation_frame; + } + + if (spheres[i].animation_position.x) { + if (spheres[i].animation_position_rand.x) { + spheres[i].center.x += (drand48()*spheres[i].animation_position.x); + } else { + spheres[i].center.x += adjusted_frame*spheres[i].animation_position.x; + } + } + if (spheres[i].animation_position.y) { + if (spheres[i].animation_position_rand.y) { + spheres[i].center.y += (drand48()*spheres[i].animation_position.y); + } else { + spheres[i].center.y += adjusted_frame*spheres[i].animation_position.y; + } + } + if (spheres[i].animation_position.z) { + if (spheres[i].animation_position_rand.z) { + spheres[i].center.z += (drand48()*spheres[i].animation_position.z); + } else { + spheres[i].center.z += adjusted_frame*spheres[i].animation_position.z; + } + } + } + +#ifdef WORK_IN_MEMORY + Vec3f *image = images[frame-1]; +#else + Vec3f *image = new Vec3f[width * height]; +#endif + + try { + + // // Trace rays + // for (unsigned y = 0; y < height; ++y) { + // for (unsigned x = 0; x < width; ++x) { + // float xx = (2 * ((x + 0.5) * invWidth) - 1) * angle * aspectratio; + // float yy = (1 - 2 * ((y + 0.5) * invHeight)) * angle; + // Vec3f raydir(xx, yy, -1); + // raydir.normalize(); + // image[y*width+x] = trace(Vec3f(0), raydir, spheres, 0); + // } + // } + + // Kernel parameters + pattern->setParameter("image", sizeof(Vec3f) * width * height, image, GSPAR_PARAM_INOUT) + .setParameter("spheres", sizeof(Sphere) * spheres_size, spheres) + .setParameter("spheres_size", spheres_size); + + unsigned long dimensions[3] = {width, height, 0}; + pattern->run(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + delete [] spheres; +#ifndef WORK_IN_MEMORY + save_image(output_folder, frame, width, height, image); + delete [] image; +#endif + } + +# ifndef NO_TIME_MEASUREMENT +#ifdef DEBUG + std::cout << "[Time] Stopping time measurement" << std::endl; +#endif + clock_t cpu_end = clock(); + time(&wall_end); + double cpu_time_seconds = ((double) (cpu_end - cpu_start)) / CLOCKS_PER_SEC; + double wall_time_seconds = difftime(wall_end, wall_start); + printf("The generation of %d frames in %s of %u x %u with %lu spheres took:\n", total_frames, WORKING_MEDIA, width, height, initial_spheres.size()); + printf("%.0f wall-clock 
seconds (%.2f FPS)\n", wall_time_seconds, ((double)total_frames)/wall_time_seconds); + printf("%.2f CPU time seconds\n", cpu_time_seconds); +# endif + +#ifdef WORK_IN_MEMORY + for (int frame = 1; frame <= total_frames; frame++) { + save_image(output_folder, frame, width, height, images[frame-1]); + delete [] images[frame-1]; + } + delete [] images; +#endif +} + + +int main(int argc, char **argv) +{ + int image_size_parameter = 2; + int total_frames = 1; + + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " [ [ ]]" << std::endl; + std::cerr << " : XML with the scene description" << std::endl; + std::cerr << " : Folder on which produce output images" << std::endl; + std::cerr << " : Size of images to generate, a single integer meaning 1=320x180, 2=640x360, 4=HD, 6=FHD and so on. Defaults to " << image_size_parameter << std::endl; + std::cerr << " : Number of frames to produce. Defaults to " << total_frames << std::endl; + exit(1); + } + srand48(13); + + std::string scene_filename(argv[1]); + std::string output_folder = argv[2]; + if (argc > 3) { + image_size_parameter = atoi(argv[3]); + } + if (argc > 4) { + total_frames = atoi(argv[4]); + } + + // 1 = 320x180 + // 2 = 640x360 + // 4 = 1280x720 (HD) + // 6 = 1920x1080 (FHD) + unsigned int image_size_multiplier = 20*image_size_parameter; + + unsigned int width = image_size_multiplier*16; + unsigned int height = image_size_multiplier*9; + + std::vector initial_spheres; + +#ifdef DEBUG + std::cout << "[Init] Generating " << total_frames << " frames of " << width << "x" << height << " in " << WORKING_MEDIA << " in " << output_folder << std::endl; + std::cout << "[Init] Loading scene from " << scene_filename << std::endl; +#endif + + // Parses the scene + std::ifstream scene_file(scene_filename, std::ios::binary | std::ios::ate); + std::streamsize scene_file_size = scene_file.tellg(); + scene_file.seekg(0, std::ios::beg); + char *scene_buffer = new char[scene_file_size]; + if (scene_file.read(scene_buffer, scene_file_size)) { + rapidxml::xml_document<> doc; + doc.parse<0>(scene_buffer); + rapidxml::xml_node<> *scene_node = doc.first_node("scene"); + + rapidxml::xml_node<> *spheres_node = scene_node->first_node("spheres"); + rapidxml::xml_node<> *sphere_node = spheres_node->first_node(); + while (sphere_node != 0) { + // position, radius, surface color, reflectivity, transparency, emission color + initial_spheres.push_back(Sphere( + sphere_node->first_attribute("id")->value(), + Vec3f( //Center position + atof(sphere_node->first_node("position")->first_attribute("x")->value()), + atof(sphere_node->first_node("position")->first_attribute("y")->value()), + atof(sphere_node->first_node("position")->first_attribute("z")->value()) + ), + atof(sphere_node->first_node("size")->first_attribute("radius")->value()), // Radius + Vec3f( //Surface color + atof(sphere_node->first_node("surface_color")->first_attribute("red")->value()), + atof(sphere_node->first_node("surface_color")->first_attribute("green")->value()), + atof(sphere_node->first_node("surface_color")->first_attribute("blue")->value()) + ), + atof(sphere_node->first_node("reflectivity")->first_attribute("value")->value()), // Reflectivity + atof(sphere_node->first_node("transparency")->first_attribute("value")->value()) // Transparency + )); + if (sphere_node->first_node("emission_color")) { + initial_spheres.back().emissionColor = Vec3f( + atof(sphere_node->first_node("emission_color")->first_attribute("red")->value()), + 
atof(sphere_node->first_node("emission_color")->first_attribute("green")->value()), + atof(sphere_node->first_node("emission_color")->first_attribute("blue")->value()) + ); + } + sphere_node = sphere_node->next_sibling(); + } +#ifdef DEBUG + std::cout << "[Init] Loaded " << initial_spheres.size() << " spheres, looking for animations" << std::endl; +#endif + + rapidxml::xml_node<> *animation_node = scene_node->first_node("animation"); + for (rapidxml::xml_node<> *sphere_animation = animation_node->first_node(); + sphere_animation; sphere_animation = sphere_animation->next_sibling()) { + std::string id = sphere_animation->first_attribute("id")->value(); + for(unsigned long i = 0; i != initial_spheres.size(); i++) { + if (id == initial_spheres[i].id) { + rapidxml::xml_node<> *position_node = sphere_animation->first_node("position"); + if (position_node) { + rapidxml::xml_attribute<> *attr; + attr = position_node->first_attribute("after"); + if (attr) { + initial_spheres[i].animation_frame = atoi(attr->value()); + } + attr = position_node->first_attribute("before"); + if (attr) { + initial_spheres[i].animation_frame = atoi(attr->value())*-1; + } + attr = position_node->first_attribute("x"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.x = true; + initial_spheres[i].animation_position.x = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.x = atof(attr->value()); + } + } + attr = position_node->first_attribute("y"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.y = true; + initial_spheres[i].animation_position.y = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.y = atof(position_node->first_attribute("y")->value()); + } + } + attr = position_node->first_attribute("z"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.z = true; + initial_spheres[i].animation_position.z = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.z = atof(position_node->first_attribute("z")->value()); + } + } + } + } + } + } +#ifdef DEBUG + std::cout << "[Init] Finished loading animation for spheres" << std::endl; +#endif + + } + + raytrace(output_folder, total_frames, width, height, initial_spheres); + + return 0; +} \ No newline at end of file diff --git a/examples/pattern_api/reduce_sample.cpp b/examples/pattern_api/reduce_sample.cpp new file mode 100644 index 0000000..9e8a3e5 --- /dev/null +++ b/examples/pattern_api/reduce_sample.cpp @@ -0,0 +1,55 @@ +#ifdef GSPARDRIVER_CUDA + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#else + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#endif +#include "GSPar_PatternReduce.hpp" +using namespace GSPar::Pattern; + +int reduce_sum(const int size, const int *vector) { + int total; + try { + auto pattern = new Reduce("in_vector", "+", "total"); + pattern->setParameter("in_vector", sizeof(int) * size, vector) + .setParameter("total", sizeof(int), &total, GSPAR_PARAM_OUT); + pattern->run({(unsigned int)size, 0}); + delete pattern; + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + return total; +} + +void print_vector(int size, const int* vector, bool compact = false) { + if (compact || size > 100) { + 
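        // On reduce_sum() above: judging from this example (an interpretation,
        // not library documentation), the three Reduce constructor arguments
        // are the input buffer's parameter name, the binary operator applied
        // pairwise, and the scalar output's parameter name. A condensed,
        // illustrative restatement for an int array v of length n:
        //
        //   int total;
        //   auto sum = new Reduce("v",      // name of the input buffer parameter
        //                         "+",      // binary operator applied pairwise
        //                         "total"); // name of the scalar output parameter
        //   sum->setParameter("v", sizeof(int) * n, v)
        //      .setParameter("total", sizeof(int), &total, GSPAR_PARAM_OUT);
        //   sum->run({(unsigned int)n, 0});
        //   delete sum;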
std::cout << vector[0] << "..." << vector[size-1]; + } else { + for (int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const int VECTOR_SIZE = std::stoul(argv[1]); + int *vector = new int[VECTOR_SIZE]; + for (int i = 0; i < VECTOR_SIZE; i++) { + vector[i] = i; + } + + std::cout << "Summing vector: "; + print_vector(VECTOR_SIZE, vector); + + int total = reduce_sum(VECTOR_SIZE, vector); + + std::cout << "Summed vector of " << VECTOR_SIZE << " elements: " << total << std::endl; +} \ No newline at end of file diff --git a/examples/pattern_api/vector_sum_map.cpp b/examples/pattern_api/vector_sum_map.cpp new file mode 100644 index 0000000..d096da6 --- /dev/null +++ b/examples/pattern_api/vector_sum_map.cpp @@ -0,0 +1,107 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +void vector_sum(const unsigned int max, const unsigned int* a, const unsigned int* b, unsigned int* result) { + try { + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + result[x] = a[x] + b[x]; + )); + + pattern->setParameter("a", sizeof(unsigned int) * max, a) + .setParameter("b", sizeof(unsigned int) * max, b) + .setParameter("result", sizeof(unsigned int) * max, result, GSPAR_PARAM_OUT); + + // This set only max values + unsigned long dims[3] = {max, 0, 0}; // Pass ulong max values directly + // GSPar::Driver::Dimensions dims(max, 0, 0); // Makes struct passing max values + // GSPar::Driver::Dimensions dims = {max, 0, 0}; // Makes struct using auto-initialization with ulong max values + + // This way we can set max and min values + // GSPar::Driver::Dimensions dims = { // Makes struct using auto-initialization with ulong max and min values + // {max, 0}, // X: max, min + // {0, 0}, // Y: max, min + // {0, 0} // Z: max, min + // }; + + // Makes empty struct and them fill values for intended dimensions + // GSPar::Driver::Dimensions dims; + // dims.x = GSPar::Driver::SingleDimension(max, 5); + + pattern->run(dims); + + // We could also call initialize the Dimensions directly when calling the method: + // pattern->run({max, 0}); + + delete pattern; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + + // Create memory objects + unsigned int* result = new unsigned int[VECTOR_SIZE]; + unsigned int* a = new unsigned int[VECTOR_SIZE]; + unsigned int* b = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + a[i] = i; + b[i] = i + 1; + result[i] = 0; + } + + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(VECTOR_SIZE, a, b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); + + delete result; + delete a; + delete b; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/pattern_api/vector_sum_map_batch.cpp b/examples/pattern_api/vector_sum_map_batch.cpp new file mode 100644 index 0000000..63e4b12 --- /dev/null +++ b/examples/pattern_api/vector_sum_map_batch.cpp @@ -0,0 +1,117 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +void vector_sum(const unsigned int num_vectors, const unsigned int batch_size, const unsigned int vector_size, unsigned int **as, unsigned int **bs, unsigned int **results) { + try { + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + result[x] = a[x] + b[x]; + )); + + pattern->setParameter("size", vector_size) + .setParameterPlaceholder("a", GSPAR_PARAM_POINTER, GSPAR_PARAM_IN, true) + .setParameterPlaceholder("b", GSPAR_PARAM_POINTER, GSPAR_PARAM_IN, true) + .setParameterPlaceholder("result", GSPAR_PARAM_POINTER, GSPAR_PARAM_OUT, true); + + pattern->setBatchSize(batch_size); + + pattern->compile({vector_size, 0}); + + // If num_vectors is not divisible by batch_size, the lib issues a segfault. + // unsigned int batches = ceil((double)num_vectors/batch_size); + unsigned int batches = num_vectors/batch_size; + for (unsigned int b = 0; b < batches; b++) { + pattern->setBatchedParameter("a", sizeof(unsigned int) * vector_size, &as[b*batch_size]) + .setBatchedParameter("b", sizeof(unsigned int) * vector_size, &bs[b*batch_size]) + .setBatchedParameter("result", sizeof(unsigned int) * vector_size, &results[b*batch_size], GSPAR_PARAM_OUT); + + pattern->run(); + } + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 4) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + const unsigned int NUM_VECTORS = std::stoi(argv[2]); + const unsigned int BATCH_SIZE = std::stoi(argv[3]); + + // Create memory objects + unsigned int** results = new unsigned int*[NUM_VECTORS]; + unsigned int** as = new unsigned int*[NUM_VECTORS]; + unsigned int** bs = new unsigned int*[NUM_VECTORS]; + for (unsigned int v = 0; v < NUM_VECTORS; v++) { + results[v] = new unsigned int[VECTOR_SIZE]; + as[v] = new unsigned int[VECTOR_SIZE]; + bs[v] = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + as[v][i] = i + v; + bs[v][i] = i + v + 1; + results[v][i] = 0; + } + } + + std::cout << "Summing " << NUM_VECTORS << " vectors:" << std::endl; + for (unsigned int v = 0; v < NUM_VECTORS; v++) { + std::cout << "Vector A" << v+1 << ": "; + print_vector(VECTOR_SIZE, as[v]); + std::cout << "Vector B" << v+1 << ": "; + print_vector(VECTOR_SIZE, bs[v]); + } + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(NUM_VECTORS, BATCH_SIZE, VECTOR_SIZE, as, bs, results); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Results:" << std::endl; + for (unsigned int v = 0; v < NUM_VECTORS; v++) { + std::cout << "Vector " << v+1 << ": "; + print_vector(VECTOR_SIZE, results[v]); + } + + for (unsigned int v = 0; v < NUM_VECTORS; v++) { + delete results[v]; + delete as[v]; + delete bs[v]; + } + delete results; + delete as; + delete bs; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/pattern_api/vector_sum_map_managing_memory.cpp b/examples/pattern_api/vector_sum_map_managing_memory.cpp new file mode 100644 index 0000000..1c2e690 --- /dev/null +++ b/examples/pattern_api/vector_sum_map_managing_memory.cpp @@ -0,0 +1,103 @@ +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +void vector_sum(const unsigned int max, const unsigned int* a, const unsigned int* b, unsigned int* result) { + try { + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + result[x] = a[x] + b[x]; + )); + + auto gpu = pattern->getGpu(); + // Memory spaces for matrixes a and result are managed by hand by the programmer, + // while matrix b is managed automatically by GSParLib. + auto resultA = gpu->malloc(sizeof(unsigned int) * max, a); + resultA->copyIn(); + auto resultDev = gpu->malloc(sizeof(unsigned int) * max, result); + + // The direction GSPAR_PARAM_PRESENT indicates to GSParLib that the data is already + // in the GPU memory and no memory copies should be performed. + pattern->setParameter("a", resultA, GSPAR_PARAM_PRESENT) + .setParameter("b", sizeof(unsigned int) * max, b) + .setParameter("result", resultDev, GSPAR_PARAM_PRESENT); + + pattern->run({max, 0}); + + // Since the parameter was informed using GSPAR_PARAM_PRESENT, we should copy the data. + // This would not be necessary if we passed the parameter with the direction GSPAR_PARAM_OUT. 
+ resultDev->copyOut(); + + delete resultA; + delete resultDev; + delete pattern; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +void print_vector(unsigned int size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." << vector[size-1]; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + + // Create memory objects + unsigned int* result = new unsigned int[VECTOR_SIZE]; + unsigned int* a = new unsigned int[VECTOR_SIZE]; + unsigned int* b = new unsigned int[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + a[i] = i; + b[i] = i + 1; + result[i] = 0; + } + + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); + + auto t_start = std::chrono::steady_clock::now(); + + vector_sum(VECTOR_SIZE, a, b, result); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); + + delete result; + delete a; + delete b; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/pattern_api/vector_sum_map_parallel.cpp b/examples/pattern_api/vector_sum_map_parallel.cpp new file mode 100644 index 0000000..a930d6b --- /dev/null +++ b/examples/pattern_api/vector_sum_map_parallel.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternMap.hpp" +using namespace GSPar::Pattern; + +struct Task { + float* a; + float* b; + float* result; + float total; +}; + +void vector_sum(const unsigned int from, const unsigned int to, const unsigned int max, Task* tasks, Map* pattern) { + // Sequential version, for debugging purposes + // for (unsigned int t = from; t < to; t++) { + // tasks[t].total = 0; + // for (unsigned int x = 0; x < max; x++) { + // tasks[t].result[x] = tasks[t].a[x] + tasks[t].b[x]; + // tasks[t].total += tasks[t].result[x]; + // } + // } + // return; + + std::stringstream ss; + +#ifdef GSPAR_DEBUG + ss << "Pattern " << pattern << " processing tasks " << from+1 << " to " << to << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + for (unsigned int t = from; t < to; t++) { + try { + + // Now we set the real parameter values + pattern->setParameter("a", sizeof(float) * max, tasks[t].a) + .setParameter("b", sizeof(float) * max, tasks[t].b) + .setParameter("result", sizeof(float) * max, tasks[t].result, GSPAR_PARAM_OUT); + + + // As we compiled the kernel before, it is not needed to compile it again now. + // The pattern will automatically skip the compiling phase. + unsigned long dimensions[3] = {max, 0, 0}; // If the dimensions were to be different from the already compiled kernel, it would be re-compiled. 
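        // Compile-once / run-many is the core of this example: process_tasks()
        // below compiles the pattern a single time and hands each worker thread
        // its own clone, whose copy of the compiled kernel is reused here with
        // per-task parameters. A condensed sketch of that flow (illustrative
        // only, pseudo-loop in the comment):
        //
        //   pattern->compile(dimensions);          // once, before the threads start
        //   auto patternCopy = pattern->clone();   // one clone per worker thread
        //   // for each task t assigned to this worker:
        //   patternCopy->setParameter("a", sizeof(float) * max, tasks[t].a)
        //              .setParameter("b", sizeof(float) * max, tasks[t].b)
        //              .setParameter("result", sizeof(float) * max, tasks[t].result, GSPAR_PARAM_OUT);
        //   patternCopy->run(dimensions);          // reuses the compiled kernel, no recompilation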
+#ifdef GSPAR_DEBUG + ss << "Pattern " << pattern << " running task " << (t+1) << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + pattern->run(dimensions); + + // Reduce on CPU + for (unsigned int x = 0; x < max; x++) { + tasks[t].total += tasks[t].result[x]; + } + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + } + + delete pattern; + +} + +void process_tasks(const unsigned int max, unsigned int tasks_size, Task* tasks, unsigned int workers) { + // Sequential version, for debugging purposes + // for (unsigned int t = 0; t < tasks_size; t++) { + // tasks[t].total = 0; + // for (unsigned int x = 0; x < max; x++) { + // tasks[t].result[x] = tasks[t].a[x] + tasks[t].b[x]; + // tasks[t].total += tasks[t].result[x]; + // } + // } + // return; + + // We assume that tasks_size is divisible by workers + const unsigned int work_for_each = tasks_size/workers; + std::cout << "Starting " << workers << " workers to process " << tasks_size << " tasks, " << work_for_each << " tasks for each worker" << std::endl; + + auto pattern = new Map(GSPAR_STRINGIZE_SOURCE( + result[x] = a[x] + b[x]; + )); + + try { + + // Fixed value parameters can be set. Parameter placeholder are for compiling the kernel. + pattern->setParameterPlaceholder("a") + .setParameterPlaceholder("b") + .setParameterPlaceholder("result", GSPAR_PARAM_POINTER, GSPAR_PARAM_OUT); + + // Compile the kernel once before cloning the pattern. The compiled Kernel would be copied over to all pattern's clones + unsigned long dimensions[3] = {max, 0, 0}; + pattern->compile(dimensions); + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } + + // std::cout << "Pattern have " << pattern->getParameterList().size() << " parameters" << std::endl; + + std::thread* threads = new std::thread[workers]; + for (unsigned int w = 0; w < workers; w++) { + unsigned int from = w*work_for_each; + unsigned int to = from+work_for_each; + // Pattern must be cloned for each thread. The compiled kernel is thread-safe and therefore is carried over. + auto patternCopy = pattern->clone(); + threads[w] = std::thread(vector_sum, from, to, max, tasks, patternCopy); + } + + for (unsigned int w = 0; w < workers; w++) { + threads[w].join(); + } +} + +void print_vector(unsigned int size, const float* vector, float total = 0, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." << vector[size-1]; + if (total) std::cout << " = " << total; + } else { + for (unsigned int i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + if (total) std::cout << "= " << total; + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 4) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + + const unsigned int VECTOR_SIZE = std::stoi(argv[1]); + const unsigned int WORKERS = std::stoi(argv[2]); + const unsigned int NUM_TASKS = std::stoi(argv[3]); + + if (NUM_TASKS % WORKERS != 0) { + std::cerr << "Number of tasks (" << NUM_TASKS << ") must be divisible by number of workers (" << WORKERS << ")!" 
<< std::endl; + exit(-1); + } + std::cout << "Summing vectors:" << std::endl; + + Task* tasks = new Task[NUM_TASKS]; + // Create memory objects + for (unsigned int t = 0; t < NUM_TASKS; t++) { + tasks[t].result = new float[VECTOR_SIZE]; + tasks[t].a = new float[VECTOR_SIZE]; + tasks[t].b = new float[VECTOR_SIZE]; + for (unsigned int i = 0; i < VECTOR_SIZE; i++) { + tasks[t].a[i] = (float)(i+t); + tasks[t].b[i] = (float)((i+t) * 2); + tasks[t].result[i] = 0; + } + + std::cout << "Task " << (t+1) << " vector A: "; + print_vector(VECTOR_SIZE, tasks[t].a); + std::cout << "Task " << (t+1) << " vector B: "; + print_vector(VECTOR_SIZE, tasks[t].b); + } + + auto t_start = std::chrono::steady_clock::now(); + + process_tasks(VECTOR_SIZE, NUM_TASKS, tasks, WORKERS); + + auto t_end = std::chrono::steady_clock::now(); + + // Output the result buffer + std::cout << "Results: " << std::endl; + for (unsigned int t = 0; t < NUM_TASKS; t++) { + std::cout << "Task " << (t+1) << ": "; + print_vector(VECTOR_SIZE, tasks[t].result, tasks[t].total); + + delete[] tasks[t].result; + delete[] tasks[t].a; + delete[] tasks[t].b; + } + delete tasks; + + std::cout << "Test finished succesfully in " << std::chrono::duration_cast(t_end - t_start).count() << " ms " << std::endl; + + return 0; +} diff --git a/examples/pattern_api/vector_sum_mapreduce.cpp b/examples/pattern_api/vector_sum_mapreduce.cpp new file mode 100644 index 0000000..1ad9077 --- /dev/null +++ b/examples/pattern_api/vector_sum_mapreduce.cpp @@ -0,0 +1,135 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +#ifdef GSPARDRIVER_OPENCL + #include "GSPar_OpenCL.hpp" + using namespace GSPar::Driver::OpenCL; +#else + #include "GSPar_CUDA.hpp" + using namespace GSPar::Driver::CUDA; +#endif + +#include "GSPar_PatternComposition.hpp" +using namespace GSPar::Pattern; + +void print_vector(unsigned long size, const unsigned long *vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +unsigned long vector_sum(const unsigned long max, const unsigned long *a, const unsigned long *b, unsigned long *result) { + try { + + auto map = new Map("result[x] = a[x] + b[x];"); + map->setParameter("a", sizeof(unsigned long) * max, a) + .setParameter("b", sizeof(unsigned long) * max, b) + .setParameter("result", sizeof(unsigned long) * max, result, GSPAR_PARAM_INOUT); + + unsigned long total = 5; + // "result" is the vector with the data + // "+" is the binary associative operator + // "total" must be an OUT pointer parameter + auto reduce = new Reduce("result", "+", "total"); + reduce->setParameter("result", sizeof(unsigned long) * max, result, GSPAR_PARAM_INOUT) + .setParameter("total", sizeof(unsigned long), &total, GSPAR_PARAM_INOUT); + + // Using initializer_list + // PatternComposition mapReduce {map, reduce}; + // Using variadic templates constructor + auto mapReduce = new PatternComposition(map, reduce); + + mapReduce->compilePatterns({max, 0}); + + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + mapReduce->run(); + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + delete mapReduce; + delete reduce; + delete map; + + return total; + + } catch (GSPar::GSParException &ex) { + std::cerr << "Exception: " << ex.what() << " - " << ex.getDetails() << std::endl; + exit(-1); + } +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long VECTOR_SIZE = std::stoul(argv[1]); + + // Create memory objects + unsigned long *result = new unsigned long[VECTOR_SIZE]; + unsigned long *a = new unsigned long[VECTOR_SIZE]; + unsigned long *b = new unsigned long[VECTOR_SIZE]; + for (unsigned long i = 0; i < VECTOR_SIZE; i++) { + a[i] = i; + b[i] = i + 1; + result[i] = 0; + } + +#ifdef DEBUG + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); +#endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + unsigned long total = vector_sum(VECTOR_SIZE, a, b, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + +#ifdef DEBUG + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); +#endif + + delete[] result; + delete[] a; + delete[] b; + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + +#ifdef DEBUG + std::cout << "Total: " << total << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; +#else + std::cout << total << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; +#endif + + return 0; +} diff --git a/examples/sequential/mandel.cpp 
b/examples/sequential/mandel.cpp new file mode 100644 index 0000000..315cd0d --- /dev/null +++ b/examples/sequential/mandel.cpp @@ -0,0 +1,100 @@ +#include +#include +#include +#ifdef DEBUG +#include "marX2/marX2.h" +#endif + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + +void mandelbrot(const double init_a, const double init_b, const double range, const unsigned long dim, const unsigned long niter, unsigned char *M) { + double step = range/((double) dim); + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + + for(unsigned long i = 0; i < dim; i++) { + double im=init_b+(step*i); + for (unsigned long j = 0; j < dim; j++) { + double cr; + double a=cr=init_a+step*j; + double b=im; + unsigned long k = 0; + for (k = 0; k < niter; k++) { + double a2=a*a; + double b2=b*b; + if ((a2+b2)>4.0) break; + b=2*a*b+im; + a=a2-b2+cr; + } + M[i*dim+j]= (unsigned char)(255-((k*255/niter))); + } + } + + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing +} + +int main(int argc, char **argv) { + double init_a=-2.125,init_b=-1.5,range=3.0; + unsigned long dim = 1000; + unsigned long niter = 1000; + std::cout << std::fixed << std::setprecision(0); + + #ifndef DEBUG + if (argc<3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + exit(-1); + } + #endif + if (argc > 1) { + dim = strtoul(argv[1], 0, 10); + } + if (argc > 2) { + niter = strtoul(argv[2], 0, 10); + } + + unsigned char *M = new unsigned char[dim*dim]; + + #ifdef DEBUG + SetupXWindows(dim,dim,1,NULL,"Mandelbroot"); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + mandelbrot(init_a, init_b, range, dim, niter, M); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + #ifdef DEBUG + for(unsigned long i=0; i(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Teste: " << argv[0] << " " << dim << " " << niter << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << argv[0] << " " << dim << " " << niter << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + #ifdef DEBUG + getchar(); + CloseXWindows(); + #endif + + delete[] M; + return 0; +} diff --git a/examples/sequential/matrix_multi_cm.cpp b/examples/sequential/matrix_multi_cm.cpp new file mode 100644 index 0000000..8093ba3 --- /dev/null +++ b/examples/sequential/matrix_multi_cm.cpp @@ -0,0 +1,101 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +void multiply(const unsigned long size, const float *matrixA, const float *matrixB, float *result) { + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start 
computation + + for (unsigned long i = 0; i < size; i++) { + for (unsigned long j = 0; j < size; j++) { + float sum = 0; + for(unsigned long k = 0; k < size; k++) { + // result[i * size + j] += matrixA[i * size + k] * matrixB[k * size + j]; // Row-major + sum += matrixA[k * size + i] * matrixB[j * size + k]; // Column-major + } + result[j * size + i] = sum; + } + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing +} + +void printMatrix(const unsigned long size, float *matrix, bool compact = false) { + if (compact || size > 100) { + std::cout << matrix[0] << ".." << matrix[size-1] << ".." << matrix[size*size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << std::endl; + for (unsigned long j = 0; j < size; j++) { + // std::cout << matrix[i * size + j] << " "; // Row-major + std::cout << matrix[j * size + i] << " "; // Column-major + } + } + } + std::cout << std::endl; +} + +int main(int argc, char const *argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long MX = std::stoi(argv[1]); + + float *matrixA = new float[MX * MX]; + float *matrixB = new float[MX * MX]; + float *result = new float[MX * MX]; + for (unsigned long i = 0; i < MX; i++) { + for (unsigned long j = 0; j < MX; j++) { + // Row-major + // matrixA[i * MX + j] = i+1; + // matrixB[i * MX + j] = j+1; + // result[i * MX + j] = 0; + // Column-major + matrixA[j * MX + i] = i+1; + matrixB[j * MX + i] = j+1; + result[j * MX + i] = 0; + } + } + + #ifdef DEBUG + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + std::cout << "Matrix A: "; + printMatrix(MX, matrixA); + std::cout << "Matrix B: "; + printMatrix(MX, matrixB); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + multiply(MX, matrixA, matrixB, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Result: "; + printMatrix(MX, result); + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << result[0] << ".." << result[MX-1] << ".." 
<< result[MX*MX-1] << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + delete[] matrixA; + delete[] matrixB; + delete[] result; +} diff --git a/examples/sequential/matrix_multi_rm.cpp b/examples/sequential/matrix_multi_rm.cpp new file mode 100644 index 0000000..6462321 --- /dev/null +++ b/examples/sequential/matrix_multi_rm.cpp @@ -0,0 +1,95 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +void multiply(const unsigned long size, const float *matrixA, const float *matrixB, float *result) { + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + for (unsigned long i = 0; i < size; i++) { + for (unsigned long j = 0; j < size; j++) { + float sum = 0; + for(unsigned long k = 0; k < size; k++) { + sum += matrixA[i * size + k] * matrixB[k * size + j]; // Row-major + } + result[i * size + j] = sum; + } + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing +} + +void printMatrix(const unsigned long size, float *matrix, bool compact = false) { + if (compact || size > 100) { + std::cout << matrix[0] << ".." << matrix[size-1] << ".." << matrix[size*size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << std::endl; + for (unsigned long j = 0; j < size; j++) { + std::cout << matrix[i * size + j] << " "; // Row-major + } + } + } + std::cout << std::endl; +} + +int main(int argc, char const *argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long MX = std::stoi(argv[1]); + + float *matrixA = new float[MX * MX]; + float *matrixB = new float[MX * MX]; + float *result = new float[MX * MX]; + for (unsigned long i = 0; i < MX; i++) { + for (unsigned long j = 0; j < MX; j++) { + // Row-major + matrixA[i * MX + j] = i+1; + matrixB[i * MX + j] = j+1; + result[i * MX + j] = 0; + } + } + + #ifdef DEBUG + std::cout << "Multiplying matrixes of " << MX << " x " << MX << std::endl; + std::cout << "Matrix A: "; + printMatrix(MX, matrixA); + std::cout << "Matrix B: "; + printMatrix(MX, matrixB); + #endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + multiply(MX, matrixA, matrixB, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + + #ifdef DEBUG + std::cout << "Result: "; + printMatrix(MX, result); + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; + #else + std::cout << result[0] << ".." << result[MX-1] << ".." 
<< result[MX*MX-1] << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; + #endif + + delete[] matrixA; + delete[] matrixB; + delete[] result; +} diff --git a/examples/sequential/primer.cpp b/examples/sequential/primer.cpp new file mode 100644 index 0000000..7e93895 --- /dev/null +++ b/examples/sequential/primer.cpp @@ -0,0 +1,79 @@ +/* *************************************************************************** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU General Public License. This exception does not however + * invalidate any other reasons why the executable file might be covered by + * the GNU General Public License. + * + **************************************************************************** + * Authors: Dalvan Griebler + * + * Copyright: GNU General Public License + * Description: Application that counts the number of primes between 1 and N (argument [...] are optional). + * File Name: prime.cpp + * Version: 1.0 (17/07/2016) + * Compilation Command: g++ -std=c++1y prime.cpp -o exe + * Exacution Command: ./exe -h +*/ + +#include +#include +#include +#include +#include +#include + +int prime_number ( int n ){ + int total = 0; + for (int i = 2; i <= n; i++ ){ + int prime = 1; + for (int j = 2; j < i; j++ ){ + if ( i % j == 0 ){ + prime = 0; + break; + } + } + // if (prime) { + // std::cout << "Prime found: " << i << std::endl; + // } + total = total + prime; + } + return total; +} + + +int main ( int argc, char *argv[]){ + int n = 0; + if (argc != 2){ + std::cout << "Usage: " << argv[0] << " " << std::endl; + exit(1); + } + n = atoi(argv[1]); + + auto t_start = std::chrono::high_resolution_clock::now(); + + int total_primes = prime_number( n ); + + auto t_end = std::chrono::high_resolution_clock::now(); + + std::cout << n << " max\t" << total_primes << " primes\t" << std::chrono::duration_cast(t_end-t_start).count() << "ms" << std::endl; + + return 0; +} +/******************************************************************************/ \ No newline at end of file diff --git a/examples/sequential/raytracer.cpp b/examples/sequential/raytracer.cpp new file mode 100644 index 0000000..3eae6a3 --- /dev/null +++ b/examples/sequential/raytracer.cpp @@ -0,0 +1,524 @@ +// [header] +// A very basic raytracer example. 
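+// It loads a sphere scene from an XML description (see examples/workloads/raytracer_scene.xml), optionally animates
+// the spheres across frames, and writes each rendered frame as a PPM image, either keeping all frames in memory or saving each one to disk as it is produced.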
+// [/header] +// [compile] +// c++ -o raytracer -O3 -Wall raytracer.cpp +// [/compile] +// [ignore] +// Copyright (C) 2012 www.scratchapixel.com +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// [/ignore] +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "rapidxml-1.13/rapidxml.hpp" + +#if defined __linux__ || defined __APPLE__ +// "Compiled for Linux +#else +// Windows doesn't define these values by default, Linux does +#define M_PI 3.141592653589793 +#define INFINITY 1e8 +#endif + +// This variable controls the maximum recursion depth +#define MAX_RAY_DEPTH 5 +// This variable controls if it should work in memory. If it is not defined, works in disk +#define WORK_IN_MEMORY + +#ifdef WORK_IN_MEMORY +#define WORKING_MEDIA "memory" +#else +#define WORKING_MEDIA "disk" +#endif + +template +class Vec3 +{ +public: + T x, y, z; + Vec3() : x(T(0)), y(T(0)), z(T(0)) {} + Vec3(T xx) : x(xx), y(xx), z(xx) {} + Vec3(T xx, T yy, T zz) : x(xx), y(yy), z(zz) {} + Vec3& normalize() + { + T nor2 = length2(); + if (nor2 > 0) { + T invNor = 1 / sqrt(nor2); + x *= invNor, y *= invNor, z *= invNor; + } + return *this; + } + Vec3 operator * (const T &f) const { return Vec3(x * f, y * f, z * f); } + Vec3 operator * (const Vec3 &v) const { return Vec3(x * v.x, y * v.y, z * v.z); } + T dot(const Vec3 &v) const { return x * v.x + y * v.y + z * v.z; } + Vec3 operator - (const Vec3 &v) const { return Vec3(x - v.x, y - v.y, z - v.z); } + Vec3 operator + (const Vec3 &v) const { return Vec3(x + v.x, y + v.y, z + v.z); } + Vec3& operator += (const Vec3 &v) { x += v.x, y += v.y, z += v.z; return *this; } + Vec3& operator *= (const Vec3 &v) { x *= v.x, y *= v.y, z *= v.z; return *this; } + Vec3 operator - () const { return Vec3(-x, -y, -z); } + T length2() const { return x * x + y * y + z * z; } + T length() const { return sqrt(length2()); } + friend std::ostream & operator << (std::ostream &os, const Vec3 &v) + { + os << "[" << v.x << " " << v.y << " " << v.z << "]"; + return os; + } +}; + +typedef Vec3 Vec3f; +typedef Vec3 Vec3b; + +class Sphere +{ +public: + const char *id; + Vec3f center; /// position of the sphere + float radius, radius2; /// sphere radius and radius^2 + Vec3f surfaceColor, emissionColor; /// surface color and emission (light) + float transparency, reflection; /// surface transparency and reflectivity + int animation_frame; + Vec3b animation_position_rand; + Vec3f animation_position; + Sphere() { } + Sphere( + const char *id, + const Vec3f &c, + const float &r, + const Vec3f &sc, + const float &refl = 0, + const float &transp = 0, + const Vec3f &ec = 0) : + id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc), + emissionColor(ec), transparency(transp), reflection(refl) + { + animation_frame = 0; + } + //[comment] + // Compute a ray-sphere intersection using the geometric solution + //[/comment] + bool intersect(const Vec3f &rayorig, 
const Vec3f &raydir, float &t0, float &t1) const + { + Vec3f l = center - rayorig; + float tca = l.dot(raydir); + if (tca < 0) return false; + float d2 = l.dot(l) - tca * tca; + if (d2 > radius2) return false; + float thc = sqrt(radius2 - d2); + t0 = tca - thc; + t1 = tca + thc; + + return true; + } +}; + +float mix(const float &a, const float &b, const float &mix) +{ + return b * mix + a * (1 - mix); +} + +//[comment] +// This is the main trace function. It takes a ray as argument (defined by its origin +// and direction). We test if this ray intersects any of the geometry in the scene. +// If the ray intersects an object, we compute the intersection point, the normal +// at the intersection point, and shade this point using this information. +// Shading depends on the surface property (is it transparent, reflective, diffuse). +// The function returns a color for the ray. If the ray intersects an object that +// is the color of the object at the intersection point, otherwise it returns +// the background color. +//[/comment] +Vec3f trace( + const Vec3f &rayorig, + const Vec3f &raydir, + const Sphere *spheres, + const unsigned int spheres_size, + const int &depth) +{ + //if (raydir.length() != 1) std::cerr << "Error " << raydir << std::endl; + float tnear = INFINITY; + const Sphere* sphere = NULL; + // find intersection of this ray with the sphere in the scene + for (unsigned i = 0; i < spheres_size; ++i) { + float t0 = INFINITY, t1 = INFINITY; + if (spheres[i].intersect(rayorig, raydir, t0, t1)) { + if (t0 < 0) t0 = t1; + if (t0 < tnear) { + tnear = t0; + sphere = &spheres[i]; + } + } + } + // if there's no intersection return black or background color + if (!sphere) return Vec3f(2); + Vec3f surfaceColor = 0; // color of the ray/surfaceof the object intersected by the ray + Vec3f phit = rayorig + raydir * tnear; // point of intersection + Vec3f nhit = phit - sphere->center; // normal at the intersection point + nhit.normalize(); // normalize normal direction + // If the normal and the view direction are not opposite to each other + // reverse the normal direction. That also means we are inside the sphere so set + // the inside bool to true. Finally reverse the sign of IdotN which we want + // positive. + float bias = 1e-4; // add some bias to the point from which we will be tracing + bool inside = false; + if (raydir.dot(nhit) > 0) nhit = -nhit, inside = true; + if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < MAX_RAY_DEPTH) { + float facingratio = -raydir.dot(nhit); + // change the mix value to tweak the effect + float fresneleffect = mix(pow(1 - facingratio, 3), 1, 0.1); + // compute reflection direction (not need to normalize because all vectors + // are already normalized) + Vec3f refldir = raydir - nhit * 2 * raydir.dot(nhit); + refldir.normalize(); + Vec3f reflection = trace(phit + nhit * bias, refldir, spheres, spheres_size, depth + 1); + Vec3f refraction = 0; + // if the sphere is also transparent compute refraction ray (transmission) + if (sphere->transparency) { + float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface? 
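+ // Snell's law: eta is the ratio of the refractive indices; a negative k would mean total internal reflection,
+ // which this simple implementation does not handle separately.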
+ float cosi = -nhit.dot(raydir); + float k = 1 - eta * eta * (1 - cosi * cosi); + Vec3f refrdir = raydir * eta + nhit * (eta * cosi - sqrt(k)); + refrdir.normalize(); + refraction = trace(phit - nhit * bias, refrdir, spheres, spheres_size, depth + 1); + } + // the result is a mix of reflection and refraction (if the sphere is transparent) + surfaceColor = ( + reflection * fresneleffect + + refraction * (1 - fresneleffect) * sphere->transparency) * sphere->surfaceColor; + } + else { + // it's a diffuse object, no need to raytrace any further + for (unsigned i = 0; i < spheres_size; ++i) { + if (spheres[i].emissionColor.x > 0) { + // this is a light + Vec3f transmission = 1; + Vec3f lightDirection = spheres[i].center - phit; + lightDirection.normalize(); + for (unsigned j = 0; j < spheres_size; ++j) { + if (i != j) { + float t0, t1; + if (spheres[j].intersect(phit + nhit * bias, lightDirection, t0, t1)) { + transmission = 0; + break; + } + } + } + surfaceColor += sphere->surfaceColor * transmission * + std::max(float(0), nhit.dot(lightDirection)) * spheres[i].emissionColor; + } + } + } + + return surfaceColor + sphere->emissionColor; +} + + +void save_image(const std::string output_folder, const int frame, const unsigned int width, const unsigned int height, Vec3f *image) { + // Save result to a PPM image (keep these flags if you compile under Windows) + std::stringstream ss; + ss << std::setfill('0') << std::setw(5) << frame; + std::string filename = output_folder + "/frame" + ss.str() + ".ppm"; +#ifdef DEBUG + std::cout << "[Work] Writing frame " << frame << " to " << filename << std::endl; +#endif + std::ofstream ofs(filename, std::ios::out | std::ios::binary); + ofs << "P6\n" << width << " " << height << "\n255\n"; + for (unsigned i = 0; i < width * height; ++i) { + ofs << (unsigned char)(std::min(float(1), image[i].x) * 255) << + (unsigned char)(std::min(float(1), image[i].y) * 255) << + (unsigned char)(std::min(float(1), image[i].z) * 255); + } + ofs.close(); +} + + +void raytrace(std::string output_folder, int total_frames, unsigned int width, unsigned int height, const std::vector &initial_spheres) { + float invWidth = 1 / float(width); + float invHeight = 1 / float(height); + float fov = 30; + float aspectratio = width / float(height); + float angle = tan(M_PI * 0.5 * fov / 180.); + +#ifdef WORK_IN_MEMORY + unsigned int total_memory = sizeof(Vec3f)*total_frames*width*height; + std::string total_memory_unit = " bytes"; + if (total_memory > 1024) { + total_memory = (total_frames*width*height)/1024; + total_memory_unit = " KB"; + } + if (total_memory > (10*1024)) { + total_memory /= 1024; + total_memory_unit = " MB"; + } +#ifdef DEBUG + std::cout << "[Init] Allocating " << total_memory << total_memory_unit << " of memory to store images" << std::endl; +#endif + Vec3f **images = new Vec3f*[total_frames]; + for (int f=0; f 0 && frame < spheres[i].animation_frame)) { + continue; + } + + int adjusted_frame = frame; + if (spheres[i].animation_frame < 0) { + if (frame > spheres[i].animation_frame*-1) { + adjusted_frame = spheres[i].animation_frame*-1; + } + } else if (spheres[i].animation_frame > 0) { + adjusted_frame -= spheres[i].animation_frame; + } + + if (spheres[i].animation_position.x) { + if (spheres[i].animation_position_rand.x) { + spheres[i].center.x += (drand48()*spheres[i].animation_position.x); + } else { + spheres[i].center.x += adjusted_frame*spheres[i].animation_position.x; + } + } + if (spheres[i].animation_position.y) { + if (spheres[i].animation_position_rand.y) { + 
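+ // "random" animation: jitter the Y position by up to animation_position.y per frame (drand48() returns a value in [0,1)).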
spheres[i].center.y += (drand48()*spheres[i].animation_position.y); + } else { + spheres[i].center.y += adjusted_frame*spheres[i].animation_position.y; + } + } + if (spheres[i].animation_position.z) { + if (spheres[i].animation_position_rand.z) { + spheres[i].center.z += (drand48()*spheres[i].animation_position.z); + } else { + spheres[i].center.z += adjusted_frame*spheres[i].animation_position.z; + } + } + } + +#ifdef WORK_IN_MEMORY + Vec3f *image = images[frame-1]; +#else + Vec3f *image = new Vec3f[width * height]; +#endif + // Trace rays + for (unsigned y = 0; y < height; ++y) { + for (unsigned x = 0; x < width; ++x) { + float xx = (2 * ((x + 0.5) * invWidth) - 1) * angle * aspectratio; + float yy = (1 - 2 * ((y + 0.5) * invHeight)) * angle; + Vec3f raydir(xx, yy, -1); + raydir.normalize(); + image[y*width+x] = trace(Vec3f(0), raydir, spheres, spheres_size, 0); + } + } + +#ifndef WORK_IN_MEMORY + save_image(output_folder, frame, width, height, image); + delete [] image; +#endif + } + +# ifndef NO_TIME_MEASUREMENT +#ifdef DEBUG + std::cout << "[Time] Stopping time measurement" << std::endl; +#endif + clock_t cpu_end = clock(); + time(&wall_end); + double cpu_time_seconds = ((double) (cpu_end - cpu_start)) / CLOCKS_PER_SEC; + double wall_time_seconds = difftime(wall_end, wall_start); + printf("The generation of %d frames in %s of %u x %u with %lu spheres took:\n", total_frames, WORKING_MEDIA, width, height, initial_spheres.size()); + printf("%.0f wall-clock seconds (%.2f FPS)\n", wall_time_seconds, ((double)total_frames)/wall_time_seconds); + printf("%.2f CPU time seconds\n", cpu_time_seconds); +# endif + +#ifdef WORK_IN_MEMORY + for (int frame = 1; frame <= total_frames; frame++) { + save_image(output_folder, frame, width, height, images[frame-1]); + delete [] images[frame-1]; + } + delete [] images; +#endif +} + + +int main(int argc, char **argv) +{ + int image_size_parameter = 2; + int total_frames = 1; + + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " [ [ ]]" << std::endl; + std::cerr << " : XML with the scene description" << std::endl; + std::cerr << " : Folder on which produce output images" << std::endl; + std::cerr << " : Size of images to generate, a single integer meaning 1=320x180, 2=640x360, 4=HD, 6=FHD and so on. Defaults to " << image_size_parameter << std::endl; + std::cerr << " : Number of frames to produce. 
Defaults to " << total_frames << std::endl; + exit(1); + } + srand48(13); + + std::string scene_filename(argv[1]); + std::string output_folder = argv[2]; + if (argc > 3) { + image_size_parameter = atoi(argv[3]); + } + if (argc > 4) { + total_frames = atoi(argv[4]); + } + + // 1 = 320x180 + // 2 = 640x360 + // 4 = 1280x720 (HD) + // 6 = 1920x1080 (FHD) + unsigned int image_size_multiplier = 20*image_size_parameter; + + unsigned int width = image_size_multiplier*16; + unsigned int height = image_size_multiplier*9; + + std::vector initial_spheres; + +#ifdef DEBUG + std::cout << "[Init] Generating " << total_frames << " frames of " << width << "x" << height << " in " << WORKING_MEDIA << " in " << output_folder << std::endl; + std::cout << "[Init] Loading scene from " << scene_filename << std::endl; +#endif + + // Parses the scene + std::ifstream scene_file(scene_filename, std::ios::binary | std::ios::ate); + std::streamsize scene_file_size = scene_file.tellg(); + scene_file.seekg(0, std::ios::beg); + char *scene_buffer = new char[scene_file_size]; + if (scene_file.read(scene_buffer, scene_file_size)) { + rapidxml::xml_document<> doc; + doc.parse<0>(scene_buffer); + rapidxml::xml_node<> *scene_node = doc.first_node("scene"); + + rapidxml::xml_node<> *spheres_node = scene_node->first_node("spheres"); + rapidxml::xml_node<> *sphere_node = spheres_node->first_node(); + while (sphere_node != 0) { + // position, radius, surface color, reflectivity, transparency, emission color + initial_spheres.emplace_back( + sphere_node->first_attribute("id")->value(), + Vec3f( //Center position + atof(sphere_node->first_node("position")->first_attribute("x")->value()), + atof(sphere_node->first_node("position")->first_attribute("y")->value()), + atof(sphere_node->first_node("position")->first_attribute("z")->value()) + ), + atof(sphere_node->first_node("size")->first_attribute("radius")->value()), // Radius + Vec3f( //Surface color + atof(sphere_node->first_node("surface_color")->first_attribute("red")->value()), + atof(sphere_node->first_node("surface_color")->first_attribute("green")->value()), + atof(sphere_node->first_node("surface_color")->first_attribute("blue")->value()) + ), + atof(sphere_node->first_node("reflectivity")->first_attribute("value")->value()), // Reflectivity + atof(sphere_node->first_node("transparency")->first_attribute("value")->value()) // Transparency + ); + if (sphere_node->first_node("emission_color")) { + initial_spheres.back().emissionColor = Vec3f( + atof(sphere_node->first_node("emission_color")->first_attribute("red")->value()), + atof(sphere_node->first_node("emission_color")->first_attribute("green")->value()), + atof(sphere_node->first_node("emission_color")->first_attribute("blue")->value()) + ); + } + sphere_node = sphere_node->next_sibling(); + } +#ifdef DEBUG + std::cout << "[Init] Loaded " << initial_spheres.size() << " spheres, looking for animations" << std::endl; +#endif + + rapidxml::xml_node<> *animation_node = scene_node->first_node("animation"); + for (rapidxml::xml_node<> *sphere_animation = animation_node->first_node(); + sphere_animation; sphere_animation = sphere_animation->next_sibling()) { + std::string id = sphere_animation->first_attribute("id")->value(); + for(unsigned long i = 0; i != initial_spheres.size(); i++) { + if (id == initial_spheres[i].id) { + rapidxml::xml_node<> *position_node = sphere_animation->first_node("position"); + if (position_node) { + rapidxml::xml_attribute<> *attr; + attr = position_node->first_attribute("after"); + if (attr) { + 
initial_spheres[i].animation_frame = atoi(attr->value()); + } + attr = position_node->first_attribute("before"); + if (attr) { + initial_spheres[i].animation_frame = atoi(attr->value())*-1; + } + attr = position_node->first_attribute("x"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.x = true; + initial_spheres[i].animation_position.x = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.x = atof(attr->value()); + } + } + attr = position_node->first_attribute("y"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.y = true; + initial_spheres[i].animation_position.y = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.y = atof(position_node->first_attribute("y")->value()); + } + } + attr = position_node->first_attribute("z"); + if (attr) { + if (strcmp(attr->value(), "random") == 0) { + initial_spheres[i].animation_position_rand.z = true; + initial_spheres[i].animation_position.z = atof(position_node->first_attribute("random")->value()); + } else { + initial_spheres[i].animation_position.z = atof(position_node->first_attribute("z")->value()); + } + } + } + } + } + } +#ifdef DEBUG + std::cout << "[Init] Finished loading animation for spheres" << std::endl; +#endif + + } + + raytrace(output_folder, total_frames, width, height, initial_spheres); + + return 0; +} \ No newline at end of file diff --git a/examples/sequential/reduce.cpp b/examples/sequential/reduce.cpp new file mode 100644 index 0000000..1d0d541 --- /dev/null +++ b/examples/sequential/reduce.cpp @@ -0,0 +1,79 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +unsigned int reduce_vector(const size_t vector_size, const unsigned int* vector) { + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + unsigned int total = 0; + for (size_t i = 0; i < vector_size; i++) { + total += vector[i]; + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + return total; +} + +void print_vector(size_t size, const unsigned int* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (size_t i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const size_t VECTOR_SIZE = std::stoi(argv[1]); + + // Create memory objects + unsigned int *vector = new unsigned int[VECTOR_SIZE]; + for (size_t i = 0; i < VECTOR_SIZE; i++) { + vector[i] = 1; + } + +#ifdef DEBUG + std::cout << "Reducing vector:" << std::endl; + print_vector(VECTOR_SIZE, vector); +#endif + + unsigned int total = reduce_vector(VECTOR_SIZE, vector); + + delete[] vector; + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + +#ifdef DEBUG + std::cout << "Result: " << total << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; +#else + std::cout << total << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; +#endif + + return 0; +} diff --git a/examples/sequential/saxpy.cpp b/examples/sequential/saxpy.cpp new file mode 100644 index 0000000..cbcc5ae --- /dev/null +++ b/examples/sequential/saxpy.cpp @@ -0,0 +1,96 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +unsigned long saxpy(const unsigned long vector_size, const unsigned long scal, const unsigned long* a, const unsigned long* b, unsigned long* result) { + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + unsigned long total = 0; + for (unsigned long i = 0; i < vector_size; i++) { + result[i] = scal*a[i] + b[i]; + total += result[i]; + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + tEnd = std::chrono::steady_clock::now(); // Ends finish + + return total; +} + +void print_vector(unsigned long size, const unsigned long* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 3) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long VECTOR_SIZE = std::stoul(argv[1]); + const unsigned long SCALAR = std::stoul(argv[2]); + + // Create memory objects + unsigned long *result = new unsigned long[VECTOR_SIZE]; + unsigned long *a = new unsigned long[VECTOR_SIZE]; + unsigned long *b = new unsigned long[VECTOR_SIZE]; + for (unsigned long i = 0; i < VECTOR_SIZE; i++) { + a[i] = (unsigned long)i; + b[i] = (unsigned long)i + 1; + result[i] = 0; + } + +#ifdef DEBUG + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); +#endif + + unsigned long total = saxpy(VECTOR_SIZE, SCALAR, a, b, result); + +#ifdef DEBUG + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); +#endif + + delete[] result; + delete[] a; + delete[] b; + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + +#ifdef DEBUG + std::cout << "Total: " << total << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; +#else + std::cout << total << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; +#endif + + return 0; +} diff --git a/examples/sequential/vector_sum.cpp b/examples/sequential/vector_sum.cpp new file mode 100644 index 0000000..ac12986 --- /dev/null +++ b/examples/sequential/vector_sum.cpp @@ -0,0 +1,95 @@ +#include +#include +#include + +std::chrono::steady_clock::time_point tInitialization; +std::chrono::steady_clock::time_point tComputation; +std::chrono::steady_clock::time_point tFinishing; +std::chrono::steady_clock::time_point tEnd; + + +unsigned long vector_sum(const unsigned long vector_size, const unsigned long *a, const unsigned long *b, unsigned long *result) { + tComputation = std::chrono::steady_clock::now(); // Ends initialization, start computation + + unsigned long total = 0; + for (unsigned long i = 0; i < vector_size; i++) { + result[i] = a[i] + b[i]; + total += result[i]; + } + + tFinishing = std::chrono::steady_clock::now(); // Ends computation, start finishing + + return total; +} + +void print_vector(unsigned long size, const unsigned long* vector, bool compact = false) { + if (compact || size > 100) { + std::cout << vector[0] << "..." 
<< vector[size-1]; + } else { + for (unsigned long i = 0; i < size; i++) { + std::cout << vector[i] << " "; + } + } + std::cout << std::endl; +} + +int main(int argc, const char * argv[]) { + if (argc < 2) { + std::cerr << "Use: " << argv[0] << " " << std::endl; + exit(-1); + } + std::cout << std::fixed << std::setprecision(0); + + const unsigned long VECTOR_SIZE = std::stoul(argv[1]); + + // Create memory objects + unsigned long *result = new unsigned long[VECTOR_SIZE]; + unsigned long *a = new unsigned long[VECTOR_SIZE]; + unsigned long *b = new unsigned long[VECTOR_SIZE]; + for (unsigned long i = 0; i < VECTOR_SIZE; i++) { + a[i] = i; + b[i] = i + 1; + result[i] = 0; + } + +#ifdef DEBUG + std::cout << "Summing vectors:" << std::endl; + std::cout << "Vector A: "; + print_vector(VECTOR_SIZE, a); + std::cout << "Vector B: "; + print_vector(VECTOR_SIZE, b); +#endif + + tInitialization = std::chrono::steady_clock::now(); // Begins initialization + + unsigned long total = vector_sum(VECTOR_SIZE, a, b, result); + + tEnd = std::chrono::steady_clock::now(); // Ends finish + +#ifdef DEBUG + // Output the result buffer + std::cout << "Result: "; + print_vector(VECTOR_SIZE, result); +#endif + + delete[] result; + delete[] a; + delete[] b; + + double msTotal = std::chrono::duration_cast(tEnd - tInitialization).count(); + double msInitialization = std::chrono::duration_cast(tComputation - tInitialization).count(); + double msComputation = std::chrono::duration_cast(tFinishing - tComputation).count(); + double msFinishing = std::chrono::duration_cast(tEnd - tFinishing).count(); + +#ifdef DEBUG + std::cout << "Total: " << total << std::endl; + std::cout << "Total: " << msTotal << " ms" << std::endl; + std::cout << "Initialization: " << msInitialization << " ms" << std::endl; + std::cout << "Computation: " << msComputation << " ms" << std::endl; + std::cout << "Finishing: " << msFinishing << " ms" << std::endl; +#else + std::cout << total << ";" << msTotal << ";" << msInitialization << ";" << msComputation << ";" << msFinishing << std::endl; +#endif + + return 0; +} diff --git a/examples/workloads/raytracer_scene.xml b/examples/workloads/raytracer_scene.xml new file mode 100644 index 0000000..62180e7 --- /dev/null +++ b/examples/workloads/raytracer_scene.xml @@ -0,0 +1,137 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/GSPar.hpp b/src/GSPar.hpp new file mode 100644 index 0000000..572f677 --- /dev/null +++ b/src/GSPar.hpp @@ -0,0 +1,13 @@ + +#ifndef __GSPAR_INCLUDED__ +#define __GSPAR_INCLUDED__ + +// Include Drivers +#include "GSPar_CUDA.hpp" +#include "GSPar_OpenCL.hpp" + +// Include Patterns +#include "GSPar_PatternMap.hpp" +#include "GSPar_PatternReduce.hpp" + +#endif diff --git a/src/GSPar_Base.cpp b/src/GSPar_Base.cpp new file mode 100644 index 0000000..ab95146 --- /dev/null +++ b/src/GSPar_Base.cpp @@ -0,0 +1,28 @@ + +#include +#include +#include //std::generate_n + +namespace GSPar { + static bool srandInitiated = false; + + std::string getRandomString(short length) { + if (!srandInitiated) { + // Initialize random seed with ms since linux epoch + std::srand(std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count()); + srandInitiated = true; + } + + auto randchar = []() -> char { + const char charset[] = + "0123456789" + 
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + const size_t max_index = (sizeof(charset) - 1); + return charset[ std::rand() % max_index ]; + }; + std::string generatedName(length,0); + std::generate_n(generatedName.begin(), length, randchar); + return generatedName; + } +} diff --git a/src/GSPar_Base.hpp b/src/GSPar_Base.hpp new file mode 100644 index 0000000..0dfc0f8 --- /dev/null +++ b/src/GSPar_Base.hpp @@ -0,0 +1,37 @@ + +#ifndef __GSPAR_BASE_INCLUDED__ +#define __GSPAR_BASE_INCLUDED__ + +#include +#include +#include //std::generate_n + +#define GSPAR_STRINGIZE_SOURCE(...) #__VA_ARGS__ + +namespace GSPar { + + class GSParException : public std::exception { + protected: + std::string msg; + std::string details; + + public: + GSParException() : std::exception() { } + explicit GSParException(std::string msg, std::string details = "") { + this->msg = msg; + this->details = details; + } + virtual std::string what() { return this->msg; } + virtual std::string getDetails() { return this->details; } + }; + + // Auxiliary functions + std::string getRandomString(short length); + + template + inline bool instanceof(const T*) { + return std::is_base_of::value; + } +} + +#endif diff --git a/src/GSPar_BaseGPUDriver.hpp b/src/GSPar_BaseGPUDriver.hpp new file mode 100644 index 0000000..c95f33c --- /dev/null +++ b/src/GSPar_BaseGPUDriver.hpp @@ -0,0 +1,796 @@ + +#ifndef __GSPAR_BASEGPUDRIVER_INCLUDED__ +#define __GSPAR_BASEGPUDRIVER_INCLUDED__ + +#define SUPPORTED_DIMS 3 + +#include +#include +#include +#include +#include +#include +#ifdef GSPAR_DEBUG +#include //std::cout and std::cerr +#endif + +///// Forward declarations ///// + +namespace GSPar { + namespace Driver { + + enum Runtime { + GSPAR_RT_NONE, + GSPAR_RT_CUDA, + GSPAR_RT_OPENCL + }; + + struct SingleDimension { + unsigned long max; + unsigned long min; + // TODO step + // unsigned long step; + + SingleDimension() : SingleDimension(0, 0) { } + SingleDimension(unsigned long max) : SingleDimension(max, 0) { } + SingleDimension(unsigned long max, unsigned long min) : max(max), min(min) { } + + unsigned long delta() { return this->max - this->min; } + + std::string toString() { + std::string out; + if (this->min) { + out += std::to_string(this->min) + " to "; + } + out += std::to_string(this->max); + return out; + } + + SingleDimension& operator= (SingleDimension other) { // https://en.cppreference.com/w/cpp/language/copy_assignment + if (&other == this) return *this; + this->max = other.max; + this->min = other.min; + return *this; + } + explicit operator bool() const { return this->max > 0; } + bool operator==(SingleDimension& other) { + return this->max == other.max && this->min == other.min; + } + bool operator!=(SingleDimension& other) { return !(*this == other); } + SingleDimension& operator*=(unsigned int number) { + this->max *= number; + this->min *= number; + return *this; + } + SingleDimension operator*(unsigned int number) { return SingleDimension(this->max*number, this->min*number); } + }; + + struct Dimensions { + // TODO remove this crap + SingleDimension _empty; + SingleDimension x; + SingleDimension y; + SingleDimension z; + + Dimensions() : _empty(0), x(0), y(0), z(0) { }; + Dimensions(SingleDimension x, SingleDimension y) : Dimensions() { + this->x = x; + this->y = y; + } + Dimensions(SingleDimension x, SingleDimension y, SingleDimension z) : Dimensions(x, y) { + this->z = z; + } + Dimensions(unsigned long maxX, unsigned long maxY, unsigned long maxZ) : Dimensions(SingleDimension(maxX), SingleDimension(maxY), 
SingleDimension(maxZ)) { }; + /** + * Creates a 3-Dimensions with specified max values and min=0 + * @param max Max values for the 3 dimensions + */ + Dimensions(unsigned long max[3]) : Dimensions(max[0], max[1], max[2]) { }; + /** + * Created a 3-Dimensions with specified max and min values. + * Eg.: dims[0][0] is max value for X dim, dims[0][1] is min value for X dim, dims[1] is Y, dims[2] is Z + * @param dims Max and min values for dimensions + */ + Dimensions(unsigned long dims[3][2]) : Dimensions(SingleDimension(dims[0][0], dims[0][1]), SingleDimension(dims[1][0], dims[1][1]), SingleDimension(dims[2][0], dims[2][1])) { }; + // This constructor gets called instead of copy assignment when assignin directly or passing values to function + Dimensions(const Dimensions &other) : Dimensions(other.x, other.y, other.z) { }; + + bool is(unsigned int dimension) { return (bool)((*this)[dimension]); }; + int getCount() const { return (bool)this->x + (bool)this->y + (bool)this->z; } + + std::string getName(unsigned int dimension) { + if (this->is(dimension)) { + switch (dimension) { + case 0: return "x"; + case 1: return "y"; + case 2: return "z"; + } + } + return NULL; + } + + std::string toString() { + std::string out; + out += "[dim" + std::to_string(this->getCount()) + ":"; + for (int d = 0; d < this->getCount(); d++) { + out += (*this)[d].toString() + "x"; + } + out.pop_back(); + out += "]"; + return out; + } + + // https://en.cppreference.com/w/cpp/language/operators + SingleDimension& operator[] (const int index) { + if (index == 0) return this->x; + if (index == 1) return this->y; + if (index == 2) return this->z; + return this->_empty; // TODO Should we throw an exception? + } + Dimensions& operator= (Dimensions& other) { // https://en.cppreference.com/w/cpp/language/copy_assignment + if (&other == this) return *this; + this->_empty = other._empty; + this->x = other.x; + this->y = other.y; + this->z = other.z; + return *this; + } + bool operator==(Dimensions& other) { + bool ret = this->getCount() == other.getCount(); + for (int d = 0; ret && d < 3; d++) { + ret = ret && (*this)[d] == other[d]; + } + return ret; + } + bool operator!=(Dimensions& other) { return !(*this == other); } + Dimensions& operator*=(unsigned int number) { + for (int d = 0; d < this->getCount(); d++) { + (*this)[d] *= number; + } + return *this; + } + Dimensions operator*(unsigned int number) { return Dimensions( + this->x ? this->x*number : 0, + this->y ? this->y*number : 0, + this->z ? this->z*number : 0 + ); } + explicit operator bool() const { return this->getCount() > 0 && (bool)this->x; } + }; + + template + class BaseException; + + class BaseExecutionFlowBase; + + template + class BaseExecutionFlow; + + template + class BaseAsyncExecutionSupport; + + class BaseInstanceBase; + + template + class BaseInstance; + + class BaseDeviceBase; + + template + class BaseDevice; + + class BaseKernelBase; + + template + class BaseKernel; + + /** + * Class to allow storing pointers to BaseMemoryObject without templates. 
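+ * Only the buffer size and the associated host pointer are exposed here, as they do not depend on the template parameters.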
+ */ + class BaseMemoryObjectBase { + protected: + size_t size; + void* hostPtr = NULL; + public: + BaseMemoryObjectBase() {} + virtual ~BaseMemoryObjectBase() {} + + size_t getSize() { return this->size; } + void* getHostPointer() { return this->hostPtr; } + }; + + template + class BaseMemoryObject; + + template + class BaseChunkedMemoryObject; + + template + class BaseStreamElement; + + class BaseKernelGeneration; + + } +} + +#include "GSPar_Base.hpp" +#include "GSPar_BaseParallelPattern.hpp" + +namespace GSPar { + namespace Driver { + + #define defaultExceptionDetails() std::string(__func__) + " in " + std::string(__FILE__) + ":" + std::to_string(__LINE__) + #define throwExceptionIfFailed( code ) Exception::throwIfFailed( code, defaultExceptionDetails() ) + + /** + * Base class for exceptions + * + * @param Type of the (lib-specific) error code + */ + template + class BaseException : public GSParException { + protected: + TLibCode code; + + virtual std::string getErrorString(TLibCode code) = 0; + + template + static TChildException* checkError(TLibCode code, TLibCode successCode, std::string details = "") { + if (code != successCode) { + return new TChildException(code, details); + } + return nullptr; + } + + template + static void throwIfFailed(TLibCode code, TLibCode sucessCode, std::string details = "") { + TChildException* ex = BaseException::checkError(code, sucessCode, details); + if (ex != nullptr) { + throw *ex; + } + } + + public: + BaseException() : GSParException() { } + explicit BaseException(std::string msg, std::string details = "") : GSParException(msg, details) { } + explicit BaseException(TLibCode code, std::string details = "") : GSParException("", details) { + this->code = code; + this->details = details; + // This virtual method call must be placed in child's implementation + // this->msg = this->getErrorString(code); + } + TLibCode getCode() { + return this->code; + } + }; + + /** + * Class to allow storing pointers to BaseExecutionFlow without templates. + */ + class BaseExecutionFlowBase { + public: + BaseExecutionFlowBase() {} + virtual ~BaseExecutionFlowBase() {} + }; + + /** + * Classes that manage an execution flow should inherit from this class. + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the (lib-specific) underlying flow control object + */ + template + class BaseExecutionFlow : public BaseExecutionFlowBase { + protected: + TDevice* device = NULL; + TLibFlowObject flowObject = NULL; + + public: + BaseExecutionFlow() : BaseExecutionFlowBase() { } + explicit BaseExecutionFlow(TDevice* device) { + this->device = device; + } + virtual ~BaseExecutionFlow() { } + virtual void setBaseFlowObject(TLibFlowObject flowObject) { this->flowObject = flowObject; } + virtual TLibFlowObject getBaseFlowObject() { return this->flowObject; } + virtual void setDevice(TDevice* device) { this->device = device; } + virtual TDevice* getDevice() { return this->device; } + + /** + * Start the execution flow if it hasn't been started yet. + * Can be safely called multiple times. + */ + virtual TLibFlowObject start() = 0; + /** + * Wait for the operations in this execution flow to complete. + */ + virtual void synchronize() = 0; + + /** + * Check if the execution flow was provided and get the device's default execution flow otherwise. 
+ * Start the execution flow and returns + * + * @param device The device from which get the default execution flow if the executionFlow is NULL + * @param executionFlow The execution flow to start + */ + static TLibFlowObject checkAndStartFlow(TDevice* device, TExecutionFlow* executionFlow = NULL) { + if (executionFlow) { + return executionFlow->start(); + } else { + return device->startDefaultExecutionFlow(); + } + } + }; + + /** + * Classes that support asynchronous execution should inherit from this class. + * + * @param Type of the (lib-specific) underlying async object + */ + template + class BaseAsyncExecutionSupport { + protected: + TLibAsyncObj asyncObject = NULL; + bool runningAsync = false; + + virtual void clearRunningAsync() { + this->runningAsync = false; + } + + public: + BaseAsyncExecutionSupport(TLibAsyncObj asyncObj = NULL) { + if (asyncObj) this->asyncObject = asyncObj; + } + virtual ~BaseAsyncExecutionSupport() { } + virtual void setBaseAsyncObject(TLibAsyncObj asyncObject) { this->asyncObject = asyncObject; } + virtual TLibAsyncObj getBaseAsyncObject() { return this->asyncObject; } + virtual bool isRunningAsync() { return this->runningAsync; } + + /** + * Wait for the async operations represented by this async object to complete + */ + virtual void waitAsync() = 0; + }; + + /** + * Class to allow references to BaseInstance without templates. + */ + class BaseInstanceBase { + protected: + Runtime runtime; + BaseInstanceBase(Runtime rt) : runtime(rt) { } + public: + BaseInstanceBase() { } + virtual ~BaseInstanceBase() { } + }; + + /** + * This class represents the entry point of the API. + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the specialized BaseKernel class + * @param Type of the specialized BaseMemoryObject class + * @param Type of the specialized BaseChunkedMemoryObject class + * @param Type of the specialized BaseKernelGenerator class + */ + template + class BaseInstance : + public BaseInstanceBase { + private: + TKernelGenerator* kernelGenerator = nullptr; + + protected: + bool instanceInitiated = false; + std::vector devices; + virtual void loadGpuList() = 0; + virtual void clearGpuList() { + for (size_t i = 0; i < this->devices.size(); i++) { + delete this->devices[i]; + } + this->devices.clear(); + } + + BaseInstance(Runtime rt) : BaseInstanceBase(rt) { } + + public: + BaseInstance() {} + virtual ~BaseInstance() { + if (!this->devices.empty()) { + this->clearGpuList(); + } + } + virtual void init() = 0; + virtual unsigned int getGpuCount() = 0; + virtual std::vector getGpuList() { + if (this->devices.empty()) { + this->loadGpuList(); + } + return this->devices; + } + virtual TDevice* getGpu(unsigned int index) { + std::vector gpus = this->getGpuList(); + if (gpus.size() > index) { + return gpus.at(index); + } + return nullptr; + } + virtual TKernelGenerator* getKernelGenerator() { + // TODO implement thread safety + if (!this->kernelGenerator) { + this->kernelGenerator = new TKernelGenerator(); + } + return this->kernelGenerator; + } + + static TExecutionFlow getExecutionFlowType() { return TExecutionFlow(); } + static TDevice getDeviceType() { return TDevice(); } + static TKernel getKernelType() { return TKernel(); } + static TMemoryObject getMemoryObjectType() { return TMemoryObject(); } + static TChunkedMemoryObject getChunkedMemoryObjectType() { return TChunkedMemoryObject(); } + }; + + /** + * Class to allow references to BaseDevice without templates. 
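+ *
+ * Concrete devices are normally obtained through a driver instance rather than
+ * constructed directly; a hedged sketch using the CUDA driver (GPU index 0 is illustrative):
+ *   auto* driver = CUDA::Instance::getInstance();
+ *   driver->init();
+ *   auto* gpu = driver->getGpu(0);  // BaseDevice-derived object, or nullptr if absent
+ *   std::cout << gpu->getName() << ": " << gpu->getComputeUnitsCount() << " compute units" << std::endl;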
+ */ + class BaseDeviceBase { + public: + BaseDeviceBase() { } + virtual ~BaseDeviceBase() { } + }; + + /** + * Class that represent a single GPU device + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseKernel class + * @param Type of the specialized BaseMemoryObject class + * @param Type of the specialized BaseChunkedMemoryObject class + * @param Type of the (lib-specific) underlying context object + * @param Type of the (lib-specific) underlying device object + * @param Type of the (lib-specific) underlying async execution flow object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseDevice : + public BaseDeviceBase { + protected: + mutable std::mutex libContextMutex; + TLibContext libContext = NULL; + TLibDevice libDevice = NULL; + mutable std::mutex defaultExecutionFlowMutex; + TExecutionFlow* defaultExecutionFlow = NULL; //TODO use a smart pointer + + public: + BaseDevice() { } + virtual ~BaseDevice() { } + virtual TExecutionFlow* getDefaultExecutionFlow() = 0; + virtual void setBaseDeviceObject(TLibDevice device) { this->libDevice = device; } + virtual TLibDevice getBaseDeviceObject() { return this->libDevice; } + virtual void setContext(TLibContext context) { this->libContext = context; } + virtual TLibContext getContext() { return this->libContext; } + + virtual TLibFlowObject startDefaultExecutionFlow() = 0; + virtual const std::string getName() = 0; + virtual unsigned int getComputeUnitsCount() = 0; // Number of multiprocessors + virtual unsigned int getWarpSize() = 0; + virtual unsigned int getMaxThreadsPerBlock() = 0; + /** + * Device's global memory size + */ + virtual unsigned long getGlobalMemorySizeBytes() = 0; + /** + * Device's local (block-shared) memory size + */ + virtual unsigned long getLocalMemorySizeBytes() = 0; + /** + * Device's amount of shared memory per compute unit + */ + virtual unsigned long getSharedMemoryPerComputeUnitSizeBytes() = 0; + virtual unsigned int getClockRateMHz() = 0; + virtual bool isIntegratedMainMemory() = 0; + // virtual bool supportUnifiedMemory() = 0; //CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING + virtual TMemoryObject* malloc(long size, void* hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) = 0; + virtual TMemoryObject* malloc(long size, const void* hostPtr = nullptr) = 0; + virtual TChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, void** hostPointers = nullptr, bool readOnly = false, bool writeOnly = false) = 0; + virtual TChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, const void** hostPointers = nullptr) = 0; + // Can't convert this BaseGPUDriver instance to child Driver instance + // virtual TMemoryObject* malloc(long size, void* hostPtr = NULL) { + // return new TMemoryObject(this, size, hostPtr, false, false); + // } + virtual TKernel* prepareKernel(const std::string kernelSource, const std::string kernelName) = 0; + virtual std::vector prepareKernels(const std::string kernelSource, const std::vector kernelNames) = 0; + }; + + /** + * Class to allow storing pointers to BaseKernel without templates. 
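+ *
+ * Concrete kernels are usually created through BaseDevice::prepareKernel(). A minimal
+ * sketch, assuming `gpu`, `source`, `memObj` and `n` are set up elsewhere:
+ *   auto* kernel = gpu->prepareKernel(source, "my_kernel");  // compile from source
+ *   kernel->setParameter(memObj);                            // device buffer argument
+ *   unsigned long dims[3] = {n, 0, 0};
+ *   kernel->runAsync(dims);                                  // asynchronous launch over n items in X
+ *   kernel->waitAsync();                                     // block until completion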
+ */ + class BaseKernelBase { + public: + BaseKernelBase() {} + virtual ~BaseKernelBase() {} + + virtual void cloneInto(BaseKernelBase* other) { } + virtual Dimensions getNumBlocksAndThreads(Dimensions dims, const unsigned int maxThreadsPerBlock, size_t* maxThreadsDimension) { return dims; } + virtual Dimensions getNumBlocksAndThreadsFor(Dimensions dims) { return dims; } + }; + + /** + * Class that represent a single GPU kernel, which can be invoked multiple times. + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the specialized BaseMemoryObject class + * @param Type of the specialized BaseChunkedMemoryObject class + * @param Type of the (lib-specific) underlying async object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseKernel : + public BaseKernelBase, + virtual public BaseAsyncExecutionSupport { + protected: + std::string kernelName; + TDevice* device; + unsigned int parameterCount = 0; + unsigned int sharedMemoryBytes = 0; + Dimensions numThreadsPerBlock = {0, 0, 0}; + + BaseKernel(TDevice* device) : BaseKernel() { + this->device = device; + } + virtual Dimensions getNumBlocksAndThreads(Dimensions dims, const unsigned int maxThreadsPerBlock, size_t* maxThreadsDimension) override { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss.str(""); + #endif + // maxThreadsDimension is unsigned int[SUPPORTED_DIMS] + // Max is threads, min is blocks + Dimensions blocksAndThreads = { + {1, 1}, // X + {1, 1}, // Y + {1, 1} // Z + }; + + if (dims.y) { + if (dims.z) { + + // TODO support 3D kernels + throw GSParException("3-dimensional kernels not supported"); + + } else { + if ((dims.x.max * dims.y.max) > maxThreadsPerBlock) { + int maxThreads2D = sqrt(maxThreadsPerBlock); + maxThreadsDimension[0] = maxThreads2D; + maxThreadsDimension[1] = maxThreads2D; + } + } + } + + #ifdef GSPAR_DEBUG + if (this->numThreadsPerBlock) { + ss << "[GSPar Kernel " << this << "] Configured num of threads per block is " << this->numThreadsPerBlock.toString() << std::endl; + std::cout << ss.str(); + ss.str(""); + } + #endif + + for (int d = 0; d < SUPPORTED_DIMS; d++) { + if (dims[d]) { + if (numThreadsPerBlock[d] && numThreadsPerBlock[d].max < maxThreadsDimension[d]) { + maxThreadsDimension[d] = numThreadsPerBlock[d].max; + } + if (dims[d].delta() <= maxThreadsDimension[d]) { + blocksAndThreads[d].min = 1; // Blocks + blocksAndThreads[d].max = dims[d].delta(); // Threads + } else { + blocksAndThreads[d].min = ceil((double)dims[d].delta()/maxThreadsDimension[d]); // Blocks + blocksAndThreads[d].max = maxThreadsDimension[d]; // Threads + } + } + } + + return blocksAndThreads; + } + + public: + BaseKernel() { } + BaseKernel(TDevice* device, const std::string kernelSource, const std::string kernelName) : BaseKernel(device) { + this->kernelName = kernelName; + } + virtual ~BaseKernel() { } + virtual void cloneInto(BaseKernelBase* baseOther) override { + BaseKernelBase::cloneInto(baseOther); + BaseKernel* other = static_cast(baseOther); + other->kernelName = this->kernelName; + other->device = this->device; + other->parameterCount = this->parameterCount; + other->sharedMemoryBytes = this->sharedMemoryBytes; + } + virtual void setSharedMemoryAllocation(unsigned int sharedMemoryBytes) { + this->sharedMemoryBytes = sharedMemoryBytes; + } + virtual BaseKernel& setNumThreadsPerBlockForX(unsigned long num) { this->numThreadsPerBlock[0] = num; return *this; } 
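+ // These setNumThreadsPerBlock* setters cap the block size otherwise chosen by
+ // getNumBlocksAndThreads() above. Illustrative numbers: for a 1-D range of 10000 elements
+ // on a device with maxThreadsPerBlock = 1024, the default split is ceil(10000/1024) = 10
+ // blocks of 1024 threads (the returned SingleDimension holds blocks in "min" and threads
+ // in "max"); after setNumThreadsPerBlockForX(256) it becomes ceil(10000/256) = 40 blocks
+ // of 256 threads.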
+ virtual BaseKernel& setNumThreadsPerBlockForY(unsigned long num) { this->numThreadsPerBlock[1] = num; return *this; } + virtual BaseKernel& setNumThreadsPerBlockForZ(unsigned long num) { this->numThreadsPerBlock[2] = num; return *this; } + virtual BaseKernel& setNumThreadsPerBlockFor(int dim, unsigned long num) { + this->numThreadsPerBlock[dim] = num; + return *this; + } + virtual BaseKernel& setNumThreadsPerBlock(unsigned long numX, unsigned long numY, unsigned long numZ) { + this->numThreadsPerBlock[0] = numX; + this->numThreadsPerBlock[1] = numY; + this->numThreadsPerBlock[2] = numZ; + return *this; + } + // TODO setParameter should return the Kernel object itself to allow fluent programming, such as BaseParallelPattern + virtual int setParameter(TMemoryObject* memoryObject) = 0; + virtual int setParameter(TChunkedMemoryObject* chunkedMemoryObject) = 0; + virtual int setParameter(size_t parmSize, void* parm) = 0; + virtual int setParameter(size_t parmSize, const void* parm) = 0; + virtual void clearParameters() { + this->parameterCount = 0; + } + virtual void runAsync(unsigned long max[3], TExecutionFlow* executionFlow = NULL) { + this->runAsync(Dimensions(max), executionFlow); + } + virtual void runAsync(Dimensions max, TExecutionFlow* executionFlow = NULL) = 0; + }; + + /** + * Class that represent a single memory object. + * It is bound to a device. It holds a (optional) host and a device pointer. + * + * @param Type of the specialized BaseException class + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the (lib-specific) underlying error code + * @param Type of the (lib-specific) underlying async object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseMemoryObject : + virtual public BaseMemoryObjectBase, + virtual public BaseAsyncExecutionSupport { + protected: + // https://www.learncpp.com/cpp-tutorial/3-8a-bit-flags-and-bit-masks/ + static const unsigned char CAN_READ_FLAG = 1 << 0; + static const unsigned char CAN_WRITE_FLAG = 1 << 1; + + TDevice* device; + TLibMemoryObject devicePtr = NULL; + unsigned char flags = CAN_READ_FLAG | CAN_WRITE_FLAG; + bool _isPinnedHostMemory = false; + + /** + * @param readOnly identify that this memory object can only be read inside kernel + * @param writeOnly identify that this memory object can only be written inside kernel + */ + explicit BaseMemoryObject(bool readOnly, bool writeOnly) { + if (readOnly && writeOnly) { + throw TException("A memory object can't be read-only and write-only at the same time"); + } else if (readOnly) { + this->flags &= ~CAN_WRITE_FLAG; + } else if (writeOnly) { + this->flags &= ~CAN_READ_FLAG; + } + } + explicit BaseMemoryObject(TDevice* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly) : BaseMemoryObject(readOnly, writeOnly) { + this->device = device; + this->hostPtr = hostPtr; + this->size = size; + } + explicit BaseMemoryObject(TDevice* device, size_t size, const void* hostPtr) : + // const pointer must be read-only + BaseMemoryObject(device, size, const_cast(hostPtr), true, false) { } + + public: + BaseMemoryObject() {} + virtual ~BaseMemoryObject() {} + TLibMemoryObject getBaseMemoryObject() { return this->devicePtr; } + bool isReadOnly() { return !(this->flags & CAN_WRITE_FLAG); } + bool isWriteOnly() { return !(this->flags & CAN_READ_FLAG); } + void bindTo(void* hostPtr) { this->hostPtr = hostPtr; } + void bindTo(void* hostPtr, size_t size) { + this->bindTo(hostPtr); + 
this->size = size; + } + virtual void pinHostMemory() { this->setPinnedHostMemory(true); } + virtual void setPinnedHostMemory(bool pinned) { this->_isPinnedHostMemory = pinned; } + virtual bool isPinnedHostMemory() { return this->_isPinnedHostMemory; } + virtual void copyIn() = 0; + virtual void copyOut() = 0; + virtual void copyInAsync(TExecutionFlow* executionFlow = NULL) = 0; + virtual void copyOutAsync(TExecutionFlow* executionFlow = NULL) = 0; + }; + + /** + * Class that represent a chunked memory object. + * It is bound to a device. It holds a bunch of host pointers (the chunks) and a single device pointer. + * + * @param Type of the specialized BaseException class + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the (lib-specific) underlying error code + * @param Type of the (lib-specific) underlying async object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseChunkedMemoryObject : + virtual public BaseMemoryObjectBase, + virtual public BaseMemoryObject { + protected: + void** hostPointers = NULL; + unsigned int chunks = 0; + // We use the base property size for the chunkSize (size of each data chunk) + + // TODO shouldn't we call the base constructor? + explicit BaseChunkedMemoryObject(TDevice* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly) { + this->device = device; + this->hostPtr = NULL; + this->size = chunkSize; + this->hostPointers = hostPointers; + this->chunks = chunks; + } + explicit BaseChunkedMemoryObject(TDevice* device, unsigned int chunks, size_t chunkSize, const void** hostPointers) : + // const pointer must be read-only + BaseChunkedMemoryObject(device, chunks, chunkSize, const_cast(hostPointers), true, false) { } + + public: + BaseChunkedMemoryObject() : BaseMemoryObject() { } + virtual ~BaseChunkedMemoryObject() { } + size_t getChunkSize() { return this->size; } + unsigned int getChunkCount() { return this->chunks; } + }; + + /** + * Class that will end up being part of the stream elements + * + * @param Type of the specialized BaseExecutionFlow class + * @param Type of the specialized BaseDevice class + * @param Type of the (lib-specific) underlying async object (the same used when inheriting BaseAsyncExecutionSupport) + * @param Type of the (lib-specific) underlying async execution flow object (the same used when inheriting BaseAsyncExecutionSupport) + */ + template + class BaseStreamElement : + virtual public BaseAsyncExecutionSupport, + virtual public BaseExecutionFlow { + public: + explicit BaseStreamElement(TDevice* device) { + // We should extend BaseExecutionFlow::constructor(device) + this->device = device; + } + virtual ~BaseStreamElement() {} + + }; + + /** + * Base class for kernel code generation + */ + class BaseKernelGenerator { + protected: + std::array defaultStdVarNames = {"x", "y", "z"}; + + public: + virtual const std::string getKernelPrefix() = 0; + virtual std::string generateStdFunctions() = 0; + virtual std::string replaceMacroKeywords(std::string kernelSource) = 0; + virtual std::string generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions dims) = 0; + virtual std::string generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) = 0; + virtual std::string generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) = 0; + virtual std::string generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions 
dims) = 0; + virtual std::string getStdVarNameForDimension(std::array& patternNames, int dimension) { + if (patternNames[dimension].empty()) { + return this->defaultStdVarNames[dimension]; + } + return patternNames[dimension]; + } + virtual std::array getStdVarNames(std::array& patternNames) { + return { + this->getStdVarNameForDimension(patternNames, 0), + this->getStdVarNameForDimension(patternNames, 1), + this->getStdVarNameForDimension(patternNames, 2) + }; + } + }; + + } +} + +#endif diff --git a/src/GSPar_BaseParallelPattern.hpp b/src/GSPar_BaseParallelPattern.hpp new file mode 100644 index 0000000..f7c7770 --- /dev/null +++ b/src/GSPar_BaseParallelPattern.hpp @@ -0,0 +1,1129 @@ + +#ifndef __GSPAR_BASEPARALLELPATTERN_INCLUDED__ +#define __GSPAR_BASEPARALLELPATTERN_INCLUDED__ + +#include +#include +#include +#include +#include //std::cout and std::cerr +#include +#include //std::generate_n +#ifdef GSPAR_DEBUG +#include +#include +#endif + +// Includes for getTypeName +#include +#include +#ifndef _MSC_VER +# include +#endif +#include +#include + +///// Forward declarations ///// + +namespace GSPar { + namespace Pattern { + + enum ParameterValueType { + GSPAR_PARAM_VALUE, + GSPAR_PARAM_POINTER + }; + + enum ParameterDirection { + GSPAR_PARAM_NONE, + GSPAR_PARAM_IN, + GSPAR_PARAM_OUT, + GSPAR_PARAM_INOUT, + GSPAR_PARAM_PRESENT // It avoids memory transfers when using a MemoryObject from user + }; + + struct VarType { + std::string name; + bool isPointer; //std::is_pointer + // Remember that struct are classes also + bool isClass; //std::is_class + bool isConst; //std::is_const + bool isVolatile; //std::is_volatile + bool isLValueRef; //std::is_lvalue_reference + bool isRValueRef; //std::is_rvalue_reference + + std::string getDeclarationName() { + return std::string("") + // Classes are not supported in OpenCL C99, so we assume the class is a struct + + (isClass ? "struct " : "") + + (isConst ? "const " : "") + + (isVolatile ? "volatile " : "") + + this->getFullName(); + } + + std::string getFullName() { + return std::string("") + + (isLValueRef ? "&" : "") + + (isRValueRef ? "&&" : "") + + name; + } + + std::string toString() { + return getFullName() + + (isPointer ? "*" : ""); + } + }; + + /** + * Base class for pattern parameters + */ + class BaseParameter { + protected: + bool complete = true; // Placeholder parameters are not complete + bool batched = false; // If the parameter is part of the batch + public: + std::string name; + VarType type; + size_t size; + ParameterValueType paramValueType; + ParameterDirection direction; + + BaseParameter() { } + BaseParameter(std::string name, VarType type, size_t size, ParameterValueType paramValueType, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) : + name(name), type(type), size(size), paramValueType(paramValueType), direction(direction), batched(batched) { + // std::cout << "Creating parameter " << type.name << " " << name << " of " << size << " bytes" << (batched ? " [batched]" : "") << std::endl; + }; + virtual ~BaseParameter() { } + + virtual std::string toString() { + return this->type.getFullName() + " " + name; + } + virtual std::string getNonPointerTypeName() { + auto type = this->type.getFullName(); + if (type.back() == '*') { // Should we check isPointer instead? 
+ type.pop_back(); + } + return type; + } + virtual bool isComplete() { + return this->complete; + } + virtual void setComplete(bool complete) { + this->complete = complete; + } + virtual bool isBatched() { + return this->batched; + } + virtual bool isConstant() { + return type.isConst; + } + virtual bool isIn() { + return this->direction == GSPAR_PARAM_IN || this->direction == GSPAR_PARAM_INOUT; + } + virtual bool isOut() { + return this->direction == GSPAR_PARAM_OUT || this->direction == GSPAR_PARAM_INOUT; + } + /** + * Returns the parameter type for use inside the kernel + */ + virtual std::string toKernelParameter() { + std::string type = this->type.getFullName(); + if (this->isBatched() && paramValueType == GSPAR_PARAM_VALUE) { + // A batched parameter is a pointer of values. + // If it's a PointerParameter, we already ripped off the extra * and will flatten the pointers. + // If it's a ValueParameter, we need to add an extra * (we will use a pointer of values) + type += "*"; + } + return type + " " + this->getKernelParameterName(); + } + virtual std::string getKernelParameterName() { + return (this->isBatched() ? "gspar_batched_" : "") + this->name; + } + virtual bool isValueTyped() = 0; + }; + + template + class TypedParameter; + + class ValueParameter; + + class PointerParameter; + + class BaseParallelPattern; + + } +} + +#include "GSPar_Base.hpp" +#include "GSPar_BaseGPUDriver.hpp" + +namespace GSPar { + namespace Pattern { + + // TODO this specialized classes are completely useless. We could work all out with only BaseParameter and it would be far simpler + + /** + * A pattern typed parameter + */ + template + class TypedParameter + : public BaseParameter { + protected: + T value; + std::unique_ptr memoryObject; + Driver::BaseMemoryObjectBase* userMemoryObject = nullptr; // MemoryObject from user + public: + size_t numberOfElements; + + TypedParameter() { } + TypedParameter(std::string name, VarType type, size_t size, T value, + ParameterValueType paramValueType, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) : + BaseParameter(name, type, size, paramValueType, direction, batched), value(value) { }; + virtual ~TypedParameter() { } + + virtual Driver::BaseMemoryObjectBase *getMemoryObject() { + if(userMemoryObject != nullptr){ + return this->userMemoryObject; + } + return this->memoryObject.get(); + } + + virtual void setUserMemoryObject(Driver::BaseMemoryObjectBase* memoryObjectFromUser) { + this->userMemoryObject = memoryObjectFromUser; + } + + // virtual T getValue() { return this->value; } + }; + + /** + * A value parameter for pattern + */ + class ValueParameter + : public TypedParameter { + public: + ValueParameter() : TypedParameter() { } + ValueParameter(std::string name, VarType type, size_t size, void *value, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) : + TypedParameter(name, type, size, value, ParameterValueType::GSPAR_PARAM_VALUE, direction, batched) { + if (value == nullptr) { // It may be just a placeholder + this->complete = false; + } + }; + virtual ~ValueParameter() { } + + virtual bool isValueTyped() override { return true; } + virtual void* getPointer() { return this->value; } + + template + Driver::BaseMemoryObjectBase *malloc(TDevice gpu, unsigned int batchSize) { + if (this->isBatched()) { + // By default, it is a read-only parameter + this->memoryObject = std::unique_ptr(gpu->malloc(batchSize * this->size, this->getPointer(), true, false)); + } + // If it is a non-batched ValueParameter, we return a nullptr + 
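+ // (in that case this->memoryObject stays empty and get() below yields nullptr;
+ // non-batched values are instead passed by value via Kernel::setParameter(size, pointer)
+ // in BaseParallelPattern::setParameterInKernel)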
return this->memoryObject.get(); + } + }; + + /** + * A pointer parameter for pattern + */ + class PointerParameter + : public TypedParameter { + public: + PointerParameter() : TypedParameter() { } + // Constructor with no MemoryObject from user + PointerParameter(std::string name, VarType type, size_t size, void *value, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) : + TypedParameter(name, type, size, value, ParameterValueType::GSPAR_PARAM_POINTER, direction, batched) { + if (!value) { // It is just a placeholder + this->complete = false; + } + }; + // Constructor with MemoryObject from user + PointerParameter(std::string name, VarType type, Driver::BaseMemoryObjectBase* userMemoryObject, ParameterDirection direction = GSPAR_PARAM_IN, bool batched =false) : + TypedParameter(name, type, userMemoryObject->getSize(), userMemoryObject->getHostPointer(), ParameterValueType::GSPAR_PARAM_POINTER, direction, batched) { + this->setUserMemoryObject(userMemoryObject); + }; + virtual ~PointerParameter() { } + + virtual bool isValueTyped() override { return false; } + virtual void* getPointer() { return this->value; } + + template + Driver::BaseMemoryObjectBase *malloc(TDevice gpu, unsigned int batchSize) { + // If it is only IN, the kernel won't write, if is OUT, the kernel won't read + bool readOnly = (this->direction == Pattern::ParameterDirection::GSPAR_PARAM_IN); + bool writeOnly = (this->direction == Pattern::ParameterDirection::GSPAR_PARAM_OUT); + if (this->isBatched()) { + // A batched PointerParameter is conversible to void** + this->memoryObject = std::unique_ptr(gpu->mallocChunked(batchSize, this->size, (void**)this->getPointer(), readOnly, writeOnly)); + } else { + this->memoryObject = std::unique_ptr(gpu->malloc(this->size, this->getPointer(), readOnly, writeOnly)); + } + return this->memoryObject.get(); + } + }; + + /** + * Base class for parallel patterns + */ + class BaseParallelPattern { + private: + unsigned int gpuIndex = 0; + Driver::BaseDeviceBase* gpuDevice = nullptr; + + protected: + std::unique_ptr executionFlow; + bool batched = false; + unsigned int batchSize = 1; //TODO what if Dimension max is not divisible by batchSize? It actually segfaults + bool _isKernelCompiled = false; + bool isKernelStale = false; // Do we need to recompile the kernel? + mutable std::mutex compiledKernelMutex; + // Should we use a std::map to support multiple pre-compiled kernels? + Driver::Dimensions compiledKernelDimension; + std::shared_ptr compiledKernel; + std::string kernelName; + std::string userKernel; + std::string extraKernelCode; + std::vector paramsOrder; + // Set the thread block size (it is an optional paramenter) #gabriell + int numThreadsPerBlock[3] = {0, 0, 0}; + /** + * We use a shared_ptr of parameters, so they can be safely cloned together with the Pattern + * And they'll be automatically released as soon as all clones are destroyed + */ + std::map> params; + std::array stdVarNames; + bool useSharedMemory = false; + mutable std::mutex sharedMemoryParameterMutex; + PointerParameter* sharedMemoryParameter = nullptr; + + // Parameters + + /** + * Get the type (as string) of the template argument + * from https://stackoverflow.com/a/20170989/ + */ + template + VarType getTemplatedType() { + typedef typename std::remove_reference::type TR; + std::unique_ptr own ( + #ifndef _MSC_VER + abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr), + #else + nullptr, + #endif + std::free + ); + VarType varType; + varType.name = own != nullptr ? 
own.get() : typeid(TR).name(); + varType.isPointer = std::is_pointer::value; + if (varType.isPointer) { + typedef typename std::remove_pointer::type TNoPtr; + varType.isClass = std::is_class::value; + } else { + varType.isClass = std::is_class::value; + } + varType.isConst = std::is_const::value; + varType.isVolatile = std::is_volatile::value; + varType.isLValueRef = std::is_lvalue_reference::value; + if (!varType.isLValueRef) { // Can't be both + varType.isRValueRef = std::is_rvalue_reference::value; + } + return varType; + } + + virtual void setPointerParameter(std::string name, VarType type, size_t size, void *value, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) { + // GSPAR_PARAM_PRESENT is incorrect when using a host pointer instead of a MemoryObject + if (direction == GSPAR_PARAM_PRESENT) { + throw GSParException("Pattern parameter \"" + name + "\": GSPAR_PARAM_PRESENT is only allowed when a MemoryObject is provided"); + } + std::shared_ptr parameter(new PointerParameter(name, type, size, value, direction, batched)); + this->setParameter(parameter); + } + // Using MemoryObject from user + virtual void setPointerParameter(std::string name, VarType type, Driver::BaseMemoryObjectBase* userMemoryObject, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) { + // new PointParameter with MemoryObject from user + std::shared_ptr parameter(new PointerParameter(name, type, userMemoryObject, direction, batched)); + this->setParameter(parameter); + } + virtual void setValueParameter(std::string name, VarType type, size_t size, void *value, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) { + std::shared_ptr parameter(new ValueParameter(name, type, size, value, direction, batched)); + this->setParameter(parameter); + } + virtual void setParameter(std::shared_ptr parameter) { + // std::cout << "Setting BaseParameter " << parameter->type.getFullName() << " " << parameter->name << " of " << parameter->size << " bytes" << (parameter->isBatched() ? " [batched]" : "") << std::endl; + auto paramName = parameter.get()->name; + if (std::find(this->paramsOrder.begin(), this->paramsOrder.end(), paramName) == this->paramsOrder.end()) { + this->paramsOrder.push_back(paramName); + this->isKernelStale = true; // There is a new parameter, we need to recompile the kernel + } + this->params[paramName] = parameter; + } + + template + decltype(TDriverInstance::getExecutionFlowType())* getExecutionFlow() { + return dynamic_cast(this->executionFlow.get()); + } + + // Main run function for Parallel Pattern + template + void run(Driver::Dimensions pDims, bool useCompiledDim) { + Driver::Dimensions dimsToUse = useCompiledDim ? 
this->compiledKernelDimension : pDims; + if (!dimsToUse.getCount()) { + throw GSParException("No dimensions set to run the pattern"); + } + #ifdef GSPAR_DEBUG + std::stringstream ss; + #endif + + // TODO validade if dimsToUse is valid + + Driver::Dimensions dimsToRun = dimsToUse; + if (this->isBatched()) { + dimsToRun *= this->batchSize; + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<batchSize << " batch size, so we'll run for " << dimsToRun.toString() << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + } + + this->compile(dimsToUse); + + // #ifdef GSPAR_DEBUG + // auto gpu = this->getGpu(); + // ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<getName() << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + auto kernel = this->getCompiledKernel(); + kernel->clearParameters(); + + // Set the thread block size (it is an optional paramenter) + if (numThreadsPerBlock[0] != 0) { + kernel->setNumThreadsPerBlockForX(numThreadsPerBlock[0]); + } + if (numThreadsPerBlock[1] != 0) { + kernel->setNumThreadsPerBlockForY(numThreadsPerBlock[1]); + } + if (numThreadsPerBlock[2] != 0) { + kernel->setNumThreadsPerBlockForZ(numThreadsPerBlock[2]); + } + + this->callbackBeforeAllocatingMemoryOnGpu(dimsToUse, kernel); + + this->mallocParametersInGpu(); + + this->copyParametersFromHostToGpuAsync(); + + this->setSharedMemoryInKernel(kernel, dimsToUse); + + this->setParametersInKernel(kernel, dimsToUse); + + this->callbackAfterCopyDataFromHostToGpu(); + this->callbackBeforeRunInGpu(); + + auto executionFlow = this->getExecutionFlow(); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<runAsync(dimsToRun, executionFlow); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<waitAsync(); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<callbackAfterRunInGpu(); + + this->copyParametersFromGpuToHostAsync(); + + this->callbackAfterCopyDataFromGpuToHost(dimsToUse, kernel); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<batched; + } + + virtual BaseParallelPattern& setBatchSize(unsigned int batchSize) { + if (!batchSize) { // Set not batched + if (this->isBatched()) { + // The pattern was batched and now it isn't, we need to recompile the kernel + this->isKernelStale = true; + } + this->batched = false; + } else { + if (!this->isBatched()) { + // The pattern wasn't batched and now it is, we need to recompile the kernel + this->isKernelStale = true; + } + this->batched = true; + } + this->batchSize = batchSize; + return *this; + } + + // TODO support using GPUs based on some scheduler (round-robin, etc) + virtual void setGpuIndex(unsigned int index) { + if (this->gpuIndex != index) { + this->isKernelStale = true; // If the GPU changed, we need to recompile the kernel + this->gpuDevice = nullptr; + this->executionFlow.reset(); + this->gpuIndex = index; + } + } + virtual unsigned int getGpuIndex() { + return this->gpuIndex; + } + + template + void cloneInto(BaseParallelPattern* other) const { + this->cloneIntoNonTemplated(other); + // Clone templated values + other->setGpu((decltype(TDriverInstance::getDeviceType())*)this->gpuDevice); + // executionFlow is not copied, each instance uses it's own. 
setGpu call initializes it also + + if (this->_isKernelCompiled && !this->isKernelStale) { // We only copy the kernel if it's a valid (and usable) one + std::lock_guard lock(other->compiledKernelMutex); // Auto-unlock, RAII + other->_isKernelCompiled = this->_isKernelCompiled; + other->isKernelStale = this->isKernelStale; + // compiledKernelMutex is (quite obviously) unique for each instance + if (this->compiledKernelDimension.getCount()) { + Driver::Dimensions compiledKernelDimension = this->compiledKernelDimension; + other->compiledKernelDimension = compiledKernelDimension; + } + if (this->compiledKernel.get()) { + other->compiledKernel = std::shared_ptr(new decltype(TDriverInstance::getKernelType())()); + auto localKernel = this->getCompiledKernel(); + localKernel->cloneInto(other->compiledKernel.get()); + } + } + } + + void cloneIntoNonTemplated(BaseParallelPattern* other) const { + // Clone + other->gpuIndex = this->gpuIndex; + other->batched = this->batched; + other->batchSize = this->batchSize; + other->kernelName = this->kernelName; + other->userKernel = this->userKernel; + other->extraKernelCode = this->extraKernelCode; + other->paramsOrder = this->paramsOrder; + other->params = this->params; + other->stdVarNames = this->stdVarNames; + other->useSharedMemory = this->useSharedMemory; + other->sharedMemoryParameter = this->sharedMemoryParameter; + } + + template + BaseParallelPattern& setCompiledKernel(decltype(TDriverInstance::getKernelType())* kernel, Driver::Dimensions dims) { + std::lock_guard lock(this->compiledKernelMutex); // Auto-unlock, RAII + this->compiledKernel = std::shared_ptr(kernel); + this->compiledKernelDimension = dims; + this->_isKernelCompiled = true; + this->isKernelStale = false; + return *this; + // Auto-unlock of compiledKernelMutex, RAII + } + + template + decltype(TDriverInstance::getKernelType())* getCompiledKernel() const { + return static_cast(this->compiledKernel.get()); + } + + template + void setGpu(decltype(TDriverInstance::getDeviceType())* device) { + if (this->gpuDevice != device) { + this->gpuDevice = device; + auto executionFlow = new decltype(TDriverInstance::getExecutionFlowType())(device); + executionFlow->start(); + this->executionFlow = std::unique_ptr(executionFlow); + } + } + template + decltype(TDriverInstance::getDeviceType())* getGpu() { + // Driver::BaseDeviceBase* getGpu() { + if (this->gpuDevice == nullptr) { + TDriverInstance* driver = TDriverInstance::getInstance(); + // Driver::CUDA::Instance driver = TDriverInstance::getInstance(); //Provides autocomplete + driver->init(); + + if (driver->getGpuCount() == 0) { + return nullptr; + } + + auto gpu = driver->getGpu(this->gpuIndex); + this->setGpu(gpu); + } + return (decltype(TDriverInstance::getDeviceType())*)this->gpuDevice; + } + + virtual BaseParallelPattern& addExtraKernelCode(std::string extraKernelCode) { + this->extraKernelCode += extraKernelCode; + this->isKernelStale = true; // The kernel code changed, we need to recompile it + return *this; + } + + virtual std::pair generateDefaultControlIf(Driver::Dimensions dims, std::array stdVarNames) { + std::string r = "if ("; + for(int d = 0; d < SUPPORTED_DIMS; d++) { + if (dims[d]) { + if (this->isBatched()) { + r += "(gspar_batch_" + stdVarNames[d] + " < gspar_batch_size)&&"; + } + r += "(" + stdVarNames[d] + " < gspar_max_" + stdVarNames[d] + ")&&"; + } + } + // Removes last && + r.pop_back(); + r.pop_back(); + r += ") {\n"; + return std::make_pair(r, "}"); + } + + template + std::string generateKernelSource(Driver::Dimensions 
dims) { + + auto codeGenerator = TDriverInstance::getInstance()->getKernelGenerator(); + std::string kernelName = this->getKernelName(); + + std::pair ifDimensions = this->generateDefaultControlIf(dims, codeGenerator->getStdVarNames(this->stdVarNames)); + + return (!this->extraKernelCode.empty() ? this->extraKernelCode + "\n" : "") + + codeGenerator->getKernelPrefix() + " " + kernelName + "(" + + codeGenerator->generateParams(this, dims) + ") {\n" + + codeGenerator->generateInitKernel(this, dims) + "\n" + + codeGenerator->generateStdVariables(this, dims) + + codeGenerator->generateBatchedParametersInitialization(this, dims) + "\n" + + ifDimensions.first + + this->getKernelCore(dims, codeGenerator->getStdVarNames(this->stdVarNames)) + + "\n" + ifDimensions.second + "\n" // if (dims) + + "}\n"; // kernel + } + + virtual std::string getKernelName() { + if (this->kernelName.empty()) { + this->kernelName = "gspar_kernel_" + getRandomString(7); + } + return this->kernelName; + } + + virtual void setKernelName(std::string kernelName) { + this->kernelName = kernelName; + } + + std::array& getStdVarNames() { + return this->stdVarNames; + } + BaseParallelPattern& setStdVarNames(std::array names) { + this->stdVarNames = names; + this->isKernelStale = true; // The kernel code changed, we need to recompile it + // TODO should we check if the names really changed? + return *this; + } + + virtual std::string getKernelCore(Driver::Dimensions dims, std::array stdVarNames) { + return std::string(this->getUserKernel()); + } + std::string getUserKernel() { + return userKernel; + } + + bool isUsingSharedMemory() { + return this->useSharedMemory; + } + virtual PointerParameter* generateSharedMemoryParameter(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { + return this->getSharedMemoryParameter(); + } + virtual PointerParameter* getSharedMemoryParameter() { + return this->sharedMemoryParameter; + } + BaseParameter* getParameter(std::string name) { + auto it = this->params.find(name); + if (it == this->params.end()) { + return nullptr; + } + return it->second.get(); + } + virtual std::vector getParameterList() { + std::vector paramList; + for (auto ¶mName : this->paramsOrder) { + paramList.push_back(this->getParameter(paramName)); + } + return paramList; + } + // Set the thread block size (it is an optional paramenter) #gabriell + virtual BaseParallelPattern& setNumThreadsPerBlockForX(unsigned long num) { + return this->setNumThreadsPerBlockFor(0, num); + } + virtual BaseParallelPattern& setNumThreadsPerBlockForY(unsigned long num) { + return this->setNumThreadsPerBlockFor(1, num); + } + virtual BaseParallelPattern& setNumThreadsPerBlockForZ(unsigned long num) { + return this->setNumThreadsPerBlockFor(2, num); + } + virtual BaseParallelPattern& setNumThreadsPerBlockFor(int dim, unsigned long num) { + this->numThreadsPerBlock[dim] = num; + return *this; + } + virtual BaseParallelPattern& setNumThreadsPerBlock(unsigned long numX, unsigned long numY, unsigned long numZ) { + this->numThreadsPerBlock[0] = numX; + this->numThreadsPerBlock[1] = numY; + this->numThreadsPerBlock[2] = numZ; + return *this; + } + + /** + * Parameter placeholder + */ + template + BaseParallelPattern& setParameterPlaceholder(std::string name, ParameterValueType parameterType = GSPAR_PARAM_POINTER, ParameterDirection direction = GSPAR_PARAM_IN, bool batched = false) { + VarType varType = getTemplatedType(); + if (parameterType == ParameterValueType::GSPAR_PARAM_POINTER) { + this->setPointerParameter(name, varType, 0, nullptr, 
direction, batched); + } else if (parameterType == ParameterValueType::GSPAR_PARAM_VALUE) { + this->setValueParameter(name, varType, sizeof(T), nullptr, direction, batched); + } + if (batched) { + this->batched = true; + } + return *this; + } + + /** + * Pointer parameters + */ + template + BaseParallelPattern& setParameter(std::string name, size_t size, T* value, ParameterDirection direction = GSPAR_PARAM_IN) { + VarType varType = getTemplatedType(); + this->setPointerParameter(name, varType, size, value, direction); + return *this; + } + template + BaseParallelPattern& setParameter(std::string name, size_t size, const T* value) { + // Can't call setParameter(non-const T) because getTemplatedType would lost const information + VarType varType = getTemplatedType(); + // A const parameter must be IN, as it can't be modified + this->setPointerParameter(name, varType, size, const_cast(value), GSPAR_PARAM_IN); + return *this; + } + // Using MemoryObject from user + template + BaseParallelPattern& setParameter(std::string name, Driver::BaseMemoryObjectBase* userMemoryObject, ParameterDirection direction = GSPAR_PARAM_IN) { + VarType varType = getTemplatedType(); + this->setPointerParameter(name, varType, userMemoryObject, direction); + return *this; + } + + /** + * Value parameters + */ + template + BaseParallelPattern& setParameter(std::string name, T value) { + VarType varType = getTemplatedType(); + // We need a pointer, so we allocate memory and copy the value + T* value_copy = new T; + *value_copy = value; + // A value parameter must be IN, as it can't be modified + this->setValueParameter(name, varType, sizeof(T), value_copy, GSPAR_PARAM_IN); + return *this; + } + + /** + * Batched (pointer and value) parameters + */ + template + BaseParallelPattern& setBatchedParameter(std::string name, size_t sizeOfEachBatch, T** value, ParameterDirection direction = GSPAR_PARAM_IN) { + this->batched = true; + VarType varType = getTemplatedType(); + varType.name.pop_back(); // We receive ** due to the batch. So the kernel type is only * (we flatten the pointers) + this->setPointerParameter(name, varType, sizeOfEachBatch, value, direction, true); + return *this; + } + template + BaseParallelPattern& setBatchedParameter(std::string name, size_t sizeOfEachBatch, const T** value) { + // Can't call setBatchedParameter(non-const T) because getTypeName would lost const information + this->batched = true; + VarType varType = getTemplatedType(); + varType.name.pop_back(); // We receive ** due to the batch. So the kernel type is only * (we flatten the pointers) + // A const parameter must be IN, as it can't be modified + this->setPointerParameter(name, varType, sizeOfEachBatch, const_cast(value), GSPAR_PARAM_IN, true); + return *this; + } + template + BaseParallelPattern& setBatchedParameter(std::string name, const T* value) { + this->batched = true; + VarType varType = getTemplatedType(); + varType.name.pop_back(); // We receive * due to the batch. + // The effective kernel type is a pure value, but for the parameters we still need it to be a pointer (check BaseParameter::toKernelParameter). + this->setValueParameter(name, varType, sizeof(T), const_cast(value), GSPAR_PARAM_IN, true); + return *this; + } + + virtual bool isKernelCompiledFor(Driver::Dimensions dims) { + // We only compile if the kernel wasn't compiled yet and the configuration didn't change + return this->_isKernelCompiled && !this->isKernelStale && + // TODO #10 Do we really need the exact same dimension? The sizes are passed in parameters. 
+ this->compiledKernelDimension == dims; + } + + /** + * Compiles the pattern (including the generation and compilation of the GPU kernel) for the dims Dimensions. + * + * @param Type of the specialized BaseInstance class + * @param dims The Dimensions for which the pattern should be compiled + */ + template + BaseParallelPattern& compile(Driver::Dimensions dims) { + // We only compile if the kernel wasn't compiled yet and the configuration didn't change + if (this->isKernelCompiledFor(dims)) { + return *this; + } + std::lock_guard lock(this->compiledKernelMutex); // Auto-unlock, RAII + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[" << std::this_thread::get_id() << " GSPar "<getGpu(); + if (gpu == nullptr) { + throw GSParException("No GPU found for Pattern compilation"); + } + + std::string kernelName = this->getKernelName(); + + this->callbackBeforeGeneratingKernelSource(); + + std::string kernelSource = this->generateKernelSource(dims); + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar "<compiledKernel = std::unique_ptr{ + // (void*)(gpu->prepareKernel(kernelSource.c_str(), kernelName.c_str())), + // [](void *ptr) { delete static_cast(ptr); } + // }; + auto kernel = gpu->prepareKernel(kernelSource.c_str(), kernelName.c_str()); + this->compiledKernel = std::shared_ptr(kernel); + this->compiledKernelDimension = dims; + this->_isKernelCompiled = true; + this->isKernelStale = false; + return *this; + // Auto-unlock of compiledKernelMutex, RAII + } + + // TODO most of the following functions should have protected visibility + + /** + * Set shared memory allocation in kernel object + * + * @param Type of the specialized BaseInstance class + * @param kernel The kernel on which the shared memory will be configured + * @param dims The Dimensions for which the shared memory will be configured + */ + template + void setSharedMemoryInKernel(decltype(TDriverInstance::getKernelType())* kernel, Driver::Dimensions dims) { + if (!this->isUsingSharedMemory()) { + return; + } + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<generateSharedMemoryParameter(dims, kernel); + kernel->setSharedMemoryAllocation(shmemParam->size); + } + + /** + * Allocates memory in GPU device for this pattern's parameters + * + * @param Type of the specialized BaseInstance class + */ + template + void mallocParametersInGpu() { + auto device = this->getGpu(); + if (device == nullptr) { + throw GSParException("No GPU found to allocate memory for parameters for Pattern"); + } + for (auto ¶mName : this->paramsOrder) { + auto param = this->getParameter(paramName); + if (!param || !param->isComplete()) { + throw GSParException("Pattern parameter \"" + param->name + "\" is just a placeholder. The parameter list must be complete to run the parallel pattern."); + } + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { // It is a PointerParameter + auto paramPointer = static_cast(param); + if (paramPointer->getMemoryObject() == nullptr) { // It returns a MemoryObject from user, if available + paramPointer->malloc(device, this->batchSize); //TODO check if the batchSize changed since the last parameter allocation + #ifndef GSPAR_PATTERN_DISABLE_PINNED_MEMORY + // In some cases, copyInAsync fails with CUDA_ERROR_INVALID_VALUE: invalid argument. According to the docs: + // Memory regions requested must be either entirely registered with CUDA, or in the case of host pageable transfers, not registered at all. 
+ // Memory regions spanning over allocations that are both registered and not registered with CUDA are not supported and will return CUDA_ERROR_INVALID_VALUE. + // We confirmed that avoiding pinned memory eliminates the failure, but we are still unsure why it happens + if (paramPointer->direction == GSPAR_PARAM_INOUT || paramPointer->direction == GSPAR_PARAM_OUT) { + // Pinned memory allows for memory operations overlapping in CUDA + if (paramPointer->isBatched()) { + auto chunkedMemObj = dynamic_cast(paramPointer->getMemoryObject()); + chunkedMemObj->pinHostMemory(); + } else { + auto singleMemObj = dynamic_cast(paramPointer->getMemoryObject()); + singleMemObj->pinHostMemory(); + } + } + #endif + } + } else if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { + auto paramValue = static_cast(param); + if (paramValue->getMemoryObject() == nullptr) { + paramValue->malloc(device, this->batchSize); + } + } + } + } + + /** + * Copies IN and INOUT parameters from host to device (asynchronously) + * + * @param Type of the specialized BaseInstance class + */ + template + void copyParametersFromHostToGpuAsync() { + #ifdef GSPAR_DEBUG + std::stringstream ss; ss.str(""); + #endif + // We use the same execution flow as the kernel itself, so we don't need to wait the async copies to finish + // Waiting the async copies to finish causes OpenCL to hang (possibly a deadlock?) + auto executionFlow = this->getExecutionFlow(); + + for (auto ¶mName : this->paramsOrder) { + auto param = this->getParameter(paramName); + if (param && param->isIn()) { + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + auto paramPointer = static_cast(param); + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<name << " (" << paramPointer->getMemoryObject() << ") to GPU in flow " << executionFlow << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + if (param->isBatched()) { + auto chunkedMemObj = dynamic_cast(paramPointer->getMemoryObject()); + if (this->batchSize != chunkedMemObj->getChunkCount()) { + // The pattern batch size changed from when the parameter was created. + // If it is lower than the parameter batch size, we copy only the related chunks + // TODO what if it is higher? 
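+ // (presumably a batch size higher than getChunkCount() would make copyInAsync(c, ...)
+ // index past the allocated chunks, so callers are expected to keep batchSize within
+ // the chunk count the parameter was created with)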
+ for (unsigned int c = 0; c < this->batchSize; c++) { + chunkedMemObj->copyInAsync(c, executionFlow); + } + } else { + chunkedMemObj->copyInAsync(executionFlow); // Copy all the chunks + } + } else { + auto singleMemObj = dynamic_cast(paramPointer->getMemoryObject()); + singleMemObj->copyInAsync(executionFlow); + } + } else if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { + if (param->isBatched()) { + auto paramValue = static_cast(param); + auto memObj = dynamic_cast(paramValue->getMemoryObject()); + memObj->copyInAsync(executionFlow); + } + } + } + } + } + + template + void copyParametersFromGpuToHostAsync() { + // #ifdef GSPAR_DEBUG + // std::stringstream ss; + // #endif + for (auto& paramName : this->paramsOrder) { + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Pattern "<getParameter(paramName); + if (param && param->isOut() && param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + auto paramPointer = static_cast(param); + // TODO copy async + // memObj->copyOutAsync(); + // std::cout << "Asking to copy " << param->name << " back from GPU" << std::endl; + if (param->isBatched()) { + auto chunkedMemObj = dynamic_cast(paramPointer->getMemoryObject()); + if (this->batchSize != chunkedMemObj->getChunkCount()) { + // The pattern batch size changed from when the parameter was created. + // If it is lower than the parameter batch size, we copy only the related chunks + // TODO what if it is higher? + for (unsigned int c = 0; c < this->batchSize; c++) { + chunkedMemObj->copyOut(c); + } + } else { + chunkedMemObj->copyOut(); // Copy all the chunks + } + } else { + auto singleMemObj = dynamic_cast(paramPointer->getMemoryObject()); + if (singleMemObj) { + singleMemObj->copyOut(); + } + } + } + } + } + + template + void setParametersInKernel(decltype(TDriverInstance::getKernelType())* kernel, Driver::Dimensions dims) { + this->setDimsParametersInKernel(kernel, dims); + + if (this->isBatched()) { + kernel->setParameter(sizeof(unsigned int), &this->batchSize); + } + + // Sets Pattern parameters in Kernel object + for (auto ¶mName : this->paramsOrder) { + auto param = this->getParameter(paramName); + this->setParameterInKernel(kernel, param); + } + } + + template + void setDimsParametersInKernel(decltype(TDriverInstance::getKernelType())* kernel, Driver::Dimensions dims) { + for(int d = 0; d < dims.getCount(); d++) { + if (dims.is(d)) { + // #ifdef GSPAR_DEBUG + // std::stringstream ss; ss.str(""); + // ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<setParameter(sizeof(unsigned long), &(dims[d].max)); + if (dims[d].min && !this->isBatched()) { // Same check as codeGenerator + // TODO Support min in batches + // #ifdef GSPAR_DEBUG + // ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<setParameter(sizeof(unsigned long), &(dims[d].min)); + } + } + } + } + + template + void setParameterInKernel(decltype(TDriverInstance::getKernelType())* kernel, BaseParameter* parameter) { + if (parameter->direction == Pattern::ParameterDirection::GSPAR_PARAM_NONE) { + return; // NONE parameters doesn't go in kernel + } + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[" << std::this_thread::get_id() << " GSPar Pattern "<name << "' in kernel " << kernel << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + if (parameter->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { // It is a PointerParameter + auto paramPointer = static_cast(parameter); + if (parameter->isBatched()) { + auto chunkedMemObj = 
dynamic_cast(paramPointer->getMemoryObject()); + // We don't need to wait the async copy because they are running in the same execution flow as the kernel itself + // if (chunkedMemObj) { + // chunkedMemObj->waitAsync(); // Waits for async copy to finish + // } + kernel->setParameter(chunkedMemObj); // We can simply set the memory object + } else { + auto singleMemObj = dynamic_cast(paramPointer->getMemoryObject()); + // We don't need to wait the async copy because they are running in the same execution flow as the kernel itself + // if (singleMemObj) { + // singleMemObj->waitAsync(); // Waits for async copy to finish + // } + kernel->setParameter(singleMemObj); // We can simply set the memory object + } + } else if (parameter->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { // It is a ValueParameter + auto paramValue = static_cast(parameter); + if (parameter->isBatched()) { + // Batched ValueParameters are allocated as a single buffer + auto singleMemObj = dynamic_cast(paramValue->getMemoryObject()); + // We don't need to wait the async copy because they are running in the same execution flow as the kernel itself + // if (singleMemObj) { + // singleMemObj->waitAsync(); // Waits for async copy to finish + // } + kernel->setParameter(singleMemObj); // We can simply set the memory object + } else { + // We get the pointer directly + auto paramValue = static_cast(parameter); + kernel->setParameter(paramValue->size, paramValue->getPointer()); + } + } + + } + + template + void run() { + this->run(Driver::Dimensions(), true); + } + + template + void run(unsigned long dims[3][2]) { + this->run(Driver::Dimensions(dims), false); + } + + template + void run(unsigned long max[3]) { + this->run(Driver::Dimensions(max), false); + } + + template + void run(Driver::Dimensions dims) { + this->run(dims, false); + } + + // Overridable callbacks + // TODO these callbacks should have protected visibility + virtual void callbackBeforeGeneratingKernelSource() { } + virtual void callbackBeforeAllocatingMemoryOnGpu(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { } + virtual void callbackAfterCopyDataFromHostToGpu() { } + virtual void callbackBeforeRunInGpu() { } + virtual void callbackAfterRunInGpu() { } + virtual void callbackAfterCopyDataFromGpuToHost(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { } + }; + + } +} + +#endif diff --git a/src/GSPar_CUDA.cpp b/src/GSPar_CUDA.cpp new file mode 100644 index 0000000..92048c4 --- /dev/null +++ b/src/GSPar_CUDA.cpp @@ -0,0 +1,942 @@ + +#include +#include +#include +#include +#ifdef GSPAR_DEBUG +#include +#endif +#include +#include + +#include "GSPar_CUDA.hpp" + +using namespace GSPar::Driver::CUDA; + + +///// Exception ///// + +std::string Exception::getErrorString(CUresult code) { + const char* errName; + cuGetErrorName(code, &errName); + const char* errString; + cuGetErrorString(code, &errString); + std::string res(errName); + res.append(": "); + res.append(errString); + return res; +} +Exception::Exception(std::string msg, std::string details) : BaseException(msg, details) { } +Exception::Exception(CUresult code, std::string details) : BaseException(code, details) { + // Can't call this virtual function in the base constructor + this->msg = this->getErrorString(code); +} +Exception* Exception::checkError(CUresult code, std::string details) { + return BaseException::checkError(code, CUDA_SUCCESS, details); +} +void Exception::throwIfFailed(CUresult code, std::string details) { + // Exception* ex = Exception::checkError(code, 
details); + // if (ex) std::cerr << "Exception: " << ex->what() << " - " << ex->getDetails() << std::endl; + BaseException::throwIfFailed(code, CUDA_SUCCESS, details); +} + +std::string CompilationException::getErrorString(nvrtcResult code) { + const char* errString = nvrtcGetErrorString(code); + return std::string(errString); +} +CompilationException::CompilationException(std::string msg, std::string details) : BaseException(msg, details) { } +CompilationException::CompilationException(nvrtcResult code, std::string details) : BaseException(code, details) { + // Can't call this virtual function in the base constructor + this->msg = this->getErrorString(code); +} +CompilationException* CompilationException::checkError(nvrtcResult code, std::string details) { + return BaseException::checkError(code, NVRTC_SUCCESS, details); +} +void CompilationException::throwIfFailed(nvrtcResult code, std::string details) { + BaseException::throwIfFailed(code, NVRTC_SUCCESS, details); +} +void CompilationException::throwIfFailed(nvrtcResult code, nvrtcProgram cudaProgram, std::string details) { + if (code == NVRTC_ERROR_COMPILATION) { + size_t logSize; + nvrtcGetProgramLogSize(cudaProgram, &logSize); + char* log = new char[logSize]; + nvrtcGetProgramLog(cudaProgram, log); + details += "\n" + std::string(log); + } + BaseException::throwIfFailed(code, NVRTC_SUCCESS, details); +} + + +///// ExecutionFlow ///// + +ExecutionFlow::ExecutionFlow() : BaseExecutionFlow() { } +ExecutionFlow::ExecutionFlow(Device* device) : BaseExecutionFlow(device) { } +ExecutionFlow::~ExecutionFlow() { + // We don't throw exceptions on destructors + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + #endif + if (this->flowObject) { + // In case the device is still doing work in the stream when cuStreamDestroy() is called, + // the function will return immediately and the resources associated with the stream will + // be released automatically once the device has completed all work in the stream. 
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html + #ifdef GSPAR_DEBUG + ss << "[GSPar Execution Flow " << this << "] clearing CUstream" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( cuStreamDestroy(this->flowObject) ); + if (ex != nullptr) { + std::cerr << "Failed when releasing cuda stream of execution flow: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + + this->flowObject = NULL; + } +} +CUstream ExecutionFlow::start() { + // #ifdef GSPAR_DEBUG + // std::stringstream ss; // Using stringstream eases multi-threaded debugging + // ss << "[GSPar CUDA "<device << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + if (!this->device) { + // Can't start flow on a NULL device + throw Exception("A device is required to start an execution flow", defaultExceptionDetails()); + } + if (!this->flowObject) { + this->device->getContext(); // There must be a context to create a stream + CUstream stream; + throwExceptionIfFailed( cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING) ); + this->setBaseFlowObject(stream); + } + return this->getBaseFlowObject(); +} +void ExecutionFlow::synchronize() { + throwExceptionIfFailed( cuStreamSynchronize(this->getBaseFlowObject()) ); +} +CUstream ExecutionFlow::checkAndStartFlow(Device* device, ExecutionFlow* executionFlow) { + return BaseExecutionFlow::checkAndStartFlow(device, executionFlow); +} + + +///// AsyncExecutionSupport ///// + +AsyncExecutionSupport::AsyncExecutionSupport(CUstream asyncObj) : BaseAsyncExecutionSupport(asyncObj) { } +void AsyncExecutionSupport::waitAsync() { + if (this->asyncObject) { + throwExceptionIfFailed( cuStreamSynchronize(this->asyncObject) ); + this->runningAsync = false; + } +}; +// static +void AsyncExecutionSupport::waitAllAsync(std::initializer_list asyncs) { + for (auto async : asyncs) { + throwExceptionIfFailed( cuStreamSynchronize(async->getBaseAsyncObject()) ); + } +} + + +///// Instance ///// + +Instance *Instance::instance = nullptr; + +void Instance::loadGpuList() { + this->init(); + this->clearGpuList(); + + unsigned int gpuCount = this->getGpuCount(); + for (unsigned int i = 0; i < gpuCount; ++i) { + this->devices.push_back(new Device(i)); + } +} + +Instance::Instance() : BaseInstance(Runtime::GSPAR_RT_CUDA) { } +Instance::~Instance() { + Instance::instance = nullptr; +} +Instance* Instance::getInstance() { + // TODO implement thread-safety + if (!instance) { + instance = new Instance(); + } + return instance; +} + +void Instance::init() { + if (!this->instanceInitiated) { + throwExceptionIfFailed( cuInit(0) ); + this->instanceInitiated = true; + } +} + +unsigned int Instance::getGpuCount() { + this->init(); + int gpuCount = 0; + throwExceptionIfFailed( cuDeviceGetCount(&gpuCount) ); + return gpuCount; +} + + +///// Device ///// + +Device::Device() : BaseDevice() { } +Device::Device(int ordinal) { + this->libDevice = new CUdevice; + this->deviceId = ordinal; + throwExceptionIfFailed( cuDeviceGet(this->libDevice, ordinal) ); +} +Device::~Device() { + // We don't throw exceptions on destructors +#ifdef GSPAR_DEBUG + std::cout << "[GSPar Device " << this << "] Destructing"; +#endif + if (this->defaultExecutionFlow) { + delete this->defaultExecutionFlow; + this->defaultExecutionFlow = NULL; + } + + if (this->libContext && this->libDevice) { + Exception* ex = Exception::checkError( cuCtxSynchronize() ); + if (ex) { + std::cerr << "Failed when waiting for context to synchronize on 
Device's destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + + + ex = Exception::checkError( cuDevicePrimaryCtxRelease(*this->libDevice) ); + if (ex) { + std::cerr << "Failed when releasing primary device context on Device's destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->libContext = NULL; + } + if (this->libDevice) { + delete this->libDevice; + this->libDevice = NULL; + } +#ifdef GSPAR_DEBUG + std::cout << "[GSPar Device " << this << "] Destructed successfully"; +#endif +} +ExecutionFlow* Device::getDefaultExecutionFlow() { + std::lock_guard lock(this->defaultExecutionFlowMutex); // Auto-unlock, RAII + if (!this->defaultExecutionFlow) { + this->defaultExecutionFlow = new ExecutionFlow(this); + } + return this->defaultExecutionFlow; + // Auto-unlock of defaultExecutionFlowMutex, RAII +} +CUcontext Device::getContext() { + if (!this->libContext) { + std::lock_guard lock(this->libContextMutex); // Auto-unlock, RAII + if (!this->libContext) { // Check if someone changed it while we were waiting for the lock + CUcontext context; + throwExceptionIfFailed( cuDevicePrimaryCtxRetain(&context, *this->libDevice) ); + this->setContext(context); + } + // Auto-unlock of libContextMutex, RAII + } + // Sets the context as current for the caller thread + throwExceptionIfFailed( cuCtxSetCurrent(this->libContext) ); + return this->libContext; +} +CUstream Device::startDefaultExecutionFlow() { + return this->getDefaultExecutionFlow()->start(); +} +unsigned int Device::getDeviceId() { + this->getContext(); // There must be a context to call almost everything + return this->deviceId; +} +const std::string Device::getName() { + this->getContext(); // There must be a context to call almost everything + unsigned int default_size = 256; + char* name = new char[default_size]; + throwExceptionIfFailed( cuDeviceGetName(name, default_size, *this->getBaseDeviceObject()) ); + // Try 6 times more + while (default_size <= 16384 && std::string(name).length() > default_size) { + default_size *= 2; + delete name; + name = new char[default_size]; + throwExceptionIfFailed( cuDeviceGetName(name, default_size, *this->getBaseDeviceObject()) ); + } + return name; +} +unsigned int Device::getComputeUnitsCount() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); +} +unsigned int Device::getWarpSize() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_WARP_SIZE); +} +unsigned int Device::getMaxThreadsPerBlock() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); +} +unsigned long Device::getGlobalMemorySizeBytes() { + this->getContext(); // There must be a context to call almost everything + unsigned long bytes; + throwExceptionIfFailed( cuDeviceTotalMem(&bytes, *this->getBaseDeviceObject()) ); + return bytes; +} +unsigned long Device::getLocalMemorySizeBytes() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK); +} +unsigned long Device::getSharedMemoryPerComputeUnitSizeBytes() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); 
+} +unsigned int Device::getClockRateMHz() { + this->getContext(); // There must be a context to call almost everything + return (this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_CLOCK_RATE) / 1000); +} +bool Device::isIntegratedMainMemory() { + this->getContext(); // There must be a context to call almost everything + return this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_INTEGRATED); +} +MemoryObject* Device::malloc(long size, void* hostPtr, bool readOnly, bool writeOnly) { + return new MemoryObject(this, size, hostPtr, readOnly, writeOnly); +} +MemoryObject* Device::malloc(long size, const void* hostPtr) { + return new MemoryObject(this, size, hostPtr); +} +ChunkedMemoryObject* Device::mallocChunked(unsigned int chunks, long chunkSize, void** hostPointers, bool readOnly, bool writeOnly) { + return new ChunkedMemoryObject(this, chunks, chunkSize, hostPointers, readOnly, writeOnly); +} +ChunkedMemoryObject* Device::mallocChunked(unsigned int chunks, long chunkSize, const void** hostPointers) { + return new ChunkedMemoryObject(this, chunks, chunkSize, hostPointers); +} +Kernel* Device::prepareKernel(const std::string kernel_source, const std::string kernel_name) { + this->getContext(); // There must be a context to call almost everything + return new Kernel(this, kernel_source, kernel_name); +} +std::vector Device::prepareKernels(const std::string kernelSource, const std::vector kernelNames) { + this->getContext(); // There must be a context to call almost everything + + std::string programName = "program_" + kernelNames.front(); + + auto programAndModule = this->compileCudaProgramAndLoadModule(kernelSource, programName); + nvrtcProgram cudaProgram = std::get<0>(programAndModule); + CUmodule cudaModule = std::get<1>(programAndModule); + + std::vector kernels; + for (auto name : kernelNames) { + kernels.push_back(new Kernel(this, cudaProgram, cudaModule, name)); + } + return kernels; +} +const int Device::queryInfoNumeric(CUdevice_attribute paramName, bool cacheable) { + // https://www.quora.com/Is-it-thread-safe-to-write-to-distinct-keys-different-key-for-each-thread-in-a-std-map-in-C-for-keys-that-have-existing-entries-in-the-map/answer/John-R-Grout + if (cacheable) { // Check if the attribute is cached + std::lock_guard lock(this->attributeCacheMutex); // Auto-unlock, RAII + auto it = this->attributeCache.find(paramName); + if (it != this->attributeCache.end()) { + return it->second; + } + // Auto-unlock of attributeCacheMutex, RAII + } + + int pi; + throwExceptionIfFailed( cuDeviceGetAttribute(&pi, paramName, *this->getBaseDeviceObject()) ); + if (cacheable) { // Stores the attribute in cache + std::lock_guard lock(this->attributeCacheMutex); // Auto-unlock, RAII + this->attributeCache[paramName] = pi; + // Auto-unlock of attributeCacheMutex, RAII + } + return pi; +} +std::tuple Device::compileCudaProgramAndLoadModule(std::string source, const std::string programName) { +#ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[GSPar Device " << this << "] Kernel received to compile: [" << programName << "] = \n" << source << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + // -------------------------------------------------------------------- + // gets the compute capability + // -------------------------------------------------------------------- + int computeCapabilityMajor = this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + int computeCapabilityMinor = 
this->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + std::string computeCapabilityArg = "--gpu-architecture=compute_" + std::to_string(computeCapabilityMajor) + std::to_string(computeCapabilityMinor); + + // -------------------------------------------------------------------- + // Appending additional routines to the kernel source + // -------------------------------------------------------------------- + std::string completeKernelSource = ""; + if (computeCapabilityMajor < 6) { + // atomicAdd() for double-precision floating-point numbers is not available by + // default on devices with compute capability lower than 6.0 + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd + completeKernelSource.append(KernelGenerator::ATOMIC_ADD_POLYFILL); + } + completeKernelSource.append(Instance::getInstance()->getKernelGenerator()->generateStdFunctions()); + completeKernelSource.append(Instance::getInstance()->getKernelGenerator()->replaceMacroKeywords(source)); + +#ifdef GSPAR_DEBUG + ss << "[GSPar Device " << this << "] Complete kernel for compilation: [" << programName << "] = \n" << completeKernelSource << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + nvrtcProgram cudaProgram; + CUmodule cudaModule; + + throwCompilationExceptionIfFailed( nvrtcCreateProgram(&cudaProgram, completeKernelSource.c_str(), programName.c_str(), 0, NULL, NULL), cudaProgram ); + + // https://docs.nvidia.com/cuda/nvrtc/index.html + int numOptions = 7; + const char *compilationOptions[numOptions]; + compilationOptions[0] = "--device-as-default-execution-space"; + compilationOptions[1] = computeCapabilityArg.c_str(); + std::string gsparMacroKernel = "--define-macro=GSPAR_DEVICE_KERNEL=" + KernelGenerator::KERNEL_PREFIX; + compilationOptions[2] = gsparMacroKernel.c_str(); + std::string gsparMacroGlobalMemory = "--define-macro=GSPAR_DEVICE_GLOBAL_MEMORY=" + KernelGenerator::GLOBAL_MEMORY_PREFIX; + compilationOptions[3] = gsparMacroGlobalMemory.c_str(); + std::string gsparMacroSharedMemory = "--define-macro=GSPAR_DEVICE_SHARED_MEMORY=" + KernelGenerator::SHARED_MEMORY_PREFIX; + compilationOptions[4] = gsparMacroSharedMemory.c_str(); + std::string gsparMacroConstant = "--define-macro=GSPAR_DEVICE_CONSTANT=" + KernelGenerator::CONSTANT_PREFIX; + compilationOptions[5] = gsparMacroConstant.c_str(); + std::string gsparMacroDevFunction = "--define-macro=GSPAR_DEVICE_FUNCTION=" + KernelGenerator::DEVICE_FUNCTION_PREFIX; + compilationOptions[6] = gsparMacroDevFunction.c_str(); + +#ifdef GSPAR_DEBUG + ss << "[GSPar Device " << this << "] Compiling kernel with " << numOptions << " options: "; + for (int iDebug = 0; iDebug < numOptions; iDebug++) { + ss << compilationOptions[iDebug] << " "; + } + ss << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + throwCompilationExceptionIfFailed( nvrtcCompileProgram(cudaProgram, numOptions, compilationOptions), cudaProgram ); + + size_t ptxSize; + throwCompilationExceptionIfFailed( nvrtcGetPTXSize(cudaProgram, &ptxSize), cudaProgram ); + char* ptxSource = new char[ptxSize]; + throwCompilationExceptionIfFailed( nvrtcGetPTX(cudaProgram, ptxSource), cudaProgram ); + + unsigned int error_buffer_size = 1024; + std::vector options; + std::vector values; + char* error_log = new char[error_buffer_size]; + //Pointer to a buffer in which to print any log messages that reflect errors + options.push_back(CU_JIT_ERROR_LOG_BUFFER); + values.push_back(error_log); + //Log buffer size in bytes. 
Log messages will be capped at this size (including null terminator) + options.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); + // Casting through uintptr_t avoids compiler warning [https://stackoverflow.com/a/30106751/3136474] + values.push_back((void*)(uintptr_t)error_buffer_size); // https://developer.nvidia.com/nvidia_bug/2917596 + //Determines the target based on the current attached context (default) + options.push_back(CU_JIT_TARGET_FROM_CUCONTEXT); + values.push_back(0); //No option value required for CU_JIT_TARGET_FROM_CUCONTEXT + + Exception::throwIfFailed( cuModuleLoadDataEx(&cudaModule, ptxSource, options.size(), options.data(), values.data()), error_log); + + return std::make_tuple(cudaProgram, cudaModule); +} + + +///// Kernel ///// + +void Kernel::loadCudaFunction(const std::string kernelName) { + throwExceptionIfFailed( cuModuleGetFunction(&this->cudaFunction, this->cudaModule, kernelName.c_str()) ); +} + +Kernel::Kernel() : BaseKernel() { } +Kernel::Kernel(Device* device, const std::string kernelSource, const std::string kernelName) : BaseKernel(device, kernelSource, kernelName) { + std::string programName = "program_" + kernelName; + + auto programAndModule = this->device->compileCudaProgramAndLoadModule(kernelSource, programName); + this->cudaProgram = std::get<0>(programAndModule); + this->cudaModule = std::get<1>(programAndModule); + + this->isPrecompiled = false; //Kernel owns cudaProgram + + this->loadCudaFunction(kernelName); + + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss.str(""); + // See Kernel::getNumBlocksAndThreadsFor for explanation on this code. + int deviceRegsPerBlock = this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); + int funcNumRegs = this->queryInfoNumeric(CU_FUNC_ATTRIBUTE_NUM_REGS); + funcNumRegs *= 1.15; // +15% of margin + ss << "[GSPar Kernel " << this << "] " << this->kernelName << " Device Num regs is " << deviceRegsPerBlock << ", Func Num regs is " << funcNumRegs << "." << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif +} +Kernel::Kernel(Device* device, nvrtcProgram cudaProgram, CUmodule cudaModule, const std::string kernelName) : BaseKernel(device) { + this->cudaProgram = cudaProgram; + this->isPrecompiled = true; //Kernel shares cudaProgram + + this->cudaModule = cudaModule; + + this->loadCudaFunction(kernelName); +} +Kernel::~Kernel() { +#ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[GSPar Kernel " << this << "] Destructing..." << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + if (this->isRunningAsync()) { + this->waitAsync(); + } + if (!this->isPrecompiled && this->cudaProgram) { + nvrtcDestroyProgram(&this->cudaProgram); // We don't throw exceptions on destructors + } +} +void Kernel::cloneInto(BaseKernelBase* baseOther) { + BaseKernel::cloneInto(baseOther); + Kernel* other = static_cast(baseOther); + other->cudaProgram = this->cudaProgram; + other->cudaModule = this->cudaModule; + other->cudaFunction = this->cudaFunction; + other->kernelParams = this->kernelParams; + // TODO Who will destroy the NVRTC program? 
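+    // Note (descriptive only, based on the destructor above): once both kernels are
+    // flagged as precompiled below, neither destructor calls nvrtcDestroyProgram, so
+    // the shared program has to be released elsewhere or it is leaked.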
+ this->isPrecompiled = true; // Now the program is shared + other->isPrecompiled = true; + other->attributeCache = this->attributeCache; +} +int Kernel::setParameter(MemoryObject* memoryObject) { + CUdeviceptr* cudaObject = memoryObject->getBaseMemoryObject(); + this->kernelParams.push_back(cudaObject); + return ++this->parameterCount; +} +int Kernel::setParameter(ChunkedMemoryObject* chunkedMemoryObject) { + CUdeviceptr* cudaObject = chunkedMemoryObject->getBaseMemoryObject(); + this->kernelParams.push_back(cudaObject); + return ++this->parameterCount; +} +int Kernel::setParameter(size_t parm_size, void* parm) { + void *parmPtr = parm; + if (parm_size <= sizeof(unsigned long long)) { // We copy single values + // Should we copy all parameters? + parmPtr = new unsigned char[parm_size]; + memcpy(parmPtr, parm, parm_size); + } + this->kernelParams.push_back(parmPtr); + return ++this->parameterCount; +} +int Kernel::setParameter(size_t parm_size, const void* parm) { + // cuLaunchKernel expects a void**, so we can't work with const + // Another nice trick to cast to void*: https://migocpp.wordpress.com/2018/04/16/cuda-runtime-templates/ + return this->setParameter(parm_size, const_cast(parm)); +} +void Kernel::clearParameters() { + BaseKernel::clearParameters(); + this->kernelParams.clear(); +} +GSPar::Driver::Dimensions Kernel::getNumBlocksAndThreadsFor(Dimensions dims) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss.str(""); + #endif + + unsigned int deviceMaxThreadsPerBlock = this->device->getMaxThreadsPerBlock(); + + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Kernel " << this << "] Max threads per block in device " << this->device << ": " << deviceMaxThreadsPerBlock << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + // Check if the function uses too much registers + int deviceRegsPerBlock = this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); + int funcNumRegs = this->queryInfoNumeric(CU_FUNC_ATTRIBUTE_NUM_REGS); + // In practice, we've seen CUDA exploding with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: too many resources requested for launch + // when we use the exact number of threads that can be used according to the number of registers reported by CU_FUNC_ATTRIBUTE_NUM_REGS. + // The raytracer test is an example of such issue. + // So, we increase this number a little bit to have some margin. + funcNumRegs *= 1.15; // +15% of margin + + unsigned int regsMaxThreadsPerBlock = (double)deviceRegsPerBlock/funcNumRegs; // Max threads per block according to the register usage + + // Actual max threads per block according to device capability and function register usage + unsigned int actualMaxThreadsPerBlock = deviceMaxThreadsPerBlock; + if (regsMaxThreadsPerBlock < deviceMaxThreadsPerBlock) { + actualMaxThreadsPerBlock = regsMaxThreadsPerBlock; + } + + #ifdef GSPAR_DEBUG + ss << "[GSPar Kernel " << this << "] " << this->kernelName << " Device Num regs is " << deviceRegsPerBlock << ", Func Num regs is " << funcNumRegs << ", so max threads per block is " << regsMaxThreadsPerBlock; + ss << ". 
Max threads per block of device is " << deviceMaxThreadsPerBlock << ", but actual max threads is " << actualMaxThreadsPerBlock << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + size_t maxThreadsDimension[SUPPORTED_DIMS] = { + (size_t)this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), + (size_t)this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), + (size_t)this->device->queryInfoNumeric(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z), + }; + + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Kernel " << this << "] Max threads per dimension is " << maxThreadsDimension[0] << " x " << maxThreadsDimension[1] << " x " << maxThreadsDimension[2] << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + return this->getNumBlocksAndThreads(dims, actualMaxThreadsPerBlock, maxThreadsDimension); +} +void Kernel::runAsync(Dimensions dims, ExecutionFlow* executionFlow) { + + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[GSPar Kernel " << this << "] Running kernel async with " << this->kernelParams.size() << " parameters for " << dims.toString() << " in flow " << executionFlow << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + + if (!dims.x) { + throw Exception("The first dimension is required to run a kernel"); + } + + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Kernel " << this << "] Checking max threads per block in device " << this->device << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + Dimensions blocksAndThreads = this->getNumBlocksAndThreadsFor(dims); + + unsigned int numBlocks[SUPPORTED_DIMS] = { + (unsigned int)blocksAndThreads.x.min, + (unsigned int)blocksAndThreads.y.min, + (unsigned int)blocksAndThreads.z.min + }; + unsigned int numThreads[SUPPORTED_DIMS] = { + (unsigned int)blocksAndThreads.x.max, + (unsigned int)blocksAndThreads.y.max, + (unsigned int)blocksAndThreads.z.max + }; + + #ifdef GSPAR_DEBUG + ss << "[GSPar Kernel " << this << "] Starting kernel with " << this->kernelParams.size() << " parameters" << std::endl; + ss << "[GSPar Kernel " << this << "] Shall start " << dims.toString() << " threads: "; + ss << "starting (" << numThreads[0] << "," << numThreads[1] << "," << numThreads[2] << ") threads "; + ss << "in (" << numBlocks[0] << "," << numBlocks[1] << "," << numBlocks[2] << ") blocks "; + ss << "using " << this->sharedMemoryBytes << " bytes of shared memory in execution flow " << executionFlow << " (CUstream " << cudaStream << ")" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + throwExceptionIfFailed( cuLaunchKernel(this->cudaFunction, + numBlocks[0], numBlocks[1], numBlocks[2], // 3D blocks + numThreads[0], numThreads[1], numThreads[2], // 3D threads + this->sharedMemoryBytes, cudaStream, this->kernelParams.data(), NULL) ); + + // #ifdef GSPAR_DEBUG + // ss << "[GSPar Kernel " << this << "] Started kernel execution in execution flow " << executionFlow << " (CUstream " << cudaStream << ")" << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + this->setBaseAsyncObject(cudaStream); + + this->runningAsync = true; +} +const int Kernel::queryInfoNumeric(CUfunction_attribute paramName, bool cacheable) { + if (cacheable) { // Check if the attribute is cached + // We don't use locks here because the Kernel object is not intended to be shared among threads + auto it = this->attributeCache.find(paramName); + if (it != 
this->attributeCache.end()) { + return it->second; + } + } + + int pi; + throwExceptionIfFailed( cuFuncGetAttribute(&pi, paramName, this->cudaFunction) ); + if (cacheable) { // Stores the attribute in cache + this->attributeCache[paramName] = pi; + } + return pi; +} + + + +///// MemoryObject ///// +void MemoryObject::allocDeviceMemory() { + this->device->getContext(); // There must be a context to call cuMemAlloc + + this->devicePtr = new CUdeviceptr; // It is initialized as NULL, we have to allocate space for it + throwExceptionIfFailed( cuMemAlloc(this->devicePtr, size) ); +} + +MemoryObject::MemoryObject(Device* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly) : BaseMemoryObject(device, size, hostPtr, readOnly, writeOnly) { + this->allocDeviceMemory(); +} +MemoryObject::MemoryObject(Device* device, size_t size, const void* hostPtr) : BaseMemoryObject(device, size, hostPtr) { + this->allocDeviceMemory(); +} +MemoryObject::~MemoryObject() { + if (this->devicePtr) { + cuMemFree(*(this->devicePtr)); // We don't throw exceptions on destructors + this->devicePtr = NULL; + } + if (this->isPinnedHostMemory()) { + cuMemHostUnregister(this->hostPtr); // We don't throw exceptions on destructors + } +} +void MemoryObject::pinHostMemory() { + if (!this->isPinnedHostMemory()) { // TODO implement thread-safety + CUresult result = cuMemHostRegister(this->hostPtr, this->size, 0); + if (result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED) { + throwExceptionIfFailed(result); + } + } + BaseMemoryObject::pinHostMemory(); +} + +void MemoryObject::copyIn() { + throwExceptionIfFailed( cuMemcpyHtoD(*(this->devicePtr), this->hostPtr, this->size) ); +} +void MemoryObject::copyOut() { + throwExceptionIfFailed( cuMemcpyDtoH(this->hostPtr, *(this->devicePtr), this->size) ); +} +void MemoryObject::copyInAsync(ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + throwExceptionIfFailed( cuMemcpyHtoDAsync(*(this->devicePtr), this->hostPtr, this->size, cudaStream) ); + this->setBaseAsyncObject(cudaStream); +} +void MemoryObject::copyOutAsync(ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + throwExceptionIfFailed( cuMemcpyDtoHAsync(this->hostPtr, *(this->devicePtr), this->size, cudaStream) ); + this->setBaseAsyncObject(cudaStream); +} + + + +///// ChunkedMemoryObject ///// + +void ChunkedMemoryObject::allocDeviceMemory() { + this->device->getContext(); // There must be a context to call cuMemAlloc + + this->devicePtr = new CUdeviceptr; // It is initialized as NULL, we have to allocate space for it + throwExceptionIfFailed( cuMemAlloc(this->devicePtr, this->getChunkSize() * this->chunks) ); // We allocate space for all the chunks +} + +ChunkedMemoryObject::ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly) : + BaseChunkedMemoryObject(device, chunks, chunkSize, hostPointers, readOnly, writeOnly) { + this->allocDeviceMemory(); +} +ChunkedMemoryObject::ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, const void** hostPointers) : + BaseChunkedMemoryObject(device, chunks, chunkSize, hostPointers) { + this->allocDeviceMemory(); +} +ChunkedMemoryObject::~ChunkedMemoryObject() { } +void ChunkedMemoryObject::pinHostMemory() { + // TODO implement pinned memory in chunked memory objects + // We need to keep this empty method here while it is not implemented so the parent method 
does not get called +} +void ChunkedMemoryObject::copyIn() { + for (unsigned int chunk = 0; chunk < this->chunks; chunk++) { + this->copyIn(chunk); + } +} +void ChunkedMemoryObject::copyOut() { + for (unsigned int chunk = 0; chunk < this->chunks; chunk++) { + this->copyOut(chunk); + } +} +void ChunkedMemoryObject::copyInAsync(ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + for (unsigned int chunk = 0; chunk < this->chunks; chunk++) { + // We don't call copyInAsync(chunk) to avoid calling checkAndStartFlow for each chunk + throwExceptionIfFailed( cuMemcpyHtoDAsync((CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->hostPointers[chunk], this->getChunkSize(), cudaStream) ); + } + this->setBaseAsyncObject(cudaStream); +} +void ChunkedMemoryObject::copyOutAsync(ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + for (unsigned int chunk = 0; chunk < this->chunks; chunk++) { + // We don't call copyOutAsync(chunk) to avoid calling checkAndStartFlow for each chunk + throwExceptionIfFailed( cuMemcpyDtoHAsync(this->hostPointers[chunk], (CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->getChunkSize(), cudaStream) ); + } + this->setBaseAsyncObject(cudaStream); +} +void ChunkedMemoryObject::copyIn(unsigned int chunk) { + throwExceptionIfFailed( cuMemcpyHtoD((CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->hostPointers[chunk], this->getChunkSize()) ); +} +void ChunkedMemoryObject::copyOut(unsigned int chunk) { + throwExceptionIfFailed( cuMemcpyDtoH(this->hostPointers[chunk], (CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->getChunkSize()) ); +} +void ChunkedMemoryObject::copyInAsync(unsigned int chunk, ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + throwExceptionIfFailed( cuMemcpyHtoDAsync((CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->hostPointers[chunk], this->getChunkSize(), cudaStream) ); + this->setBaseAsyncObject(cudaStream); +} +void ChunkedMemoryObject::copyOutAsync(unsigned int chunk, ExecutionFlow* executionFlow) { + CUstream cudaStream = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + throwExceptionIfFailed( cuMemcpyDtoHAsync(this->hostPointers[chunk], (CUdeviceptr)((unsigned char*)(*this->devicePtr)+(chunk*this->getChunkSize())), this->getChunkSize(), cudaStream) ); + this->setBaseAsyncObject(cudaStream); +} + + +///// StreamElement ///// + +StreamElement::StreamElement(Device* device) : BaseStreamElement(device) { + // Can't call this virtual function in the base constructor + this->start(); +} + +StreamElement::~StreamElement() { } + + +///// KernelGenerator ///// + +const std::string KernelGenerator::KERNEL_PREFIX = "extern \"C\" __global__"; +const std::string KernelGenerator::GLOBAL_MEMORY_PREFIX = ""; +const std::string KernelGenerator::SHARED_MEMORY_PREFIX = "extern __shared__"; +const std::string KernelGenerator::CONSTANT_PREFIX = "const"; +const std::string KernelGenerator::DEVICE_FUNCTION_PREFIX = "__device__"; +const std::string KernelGenerator::ATOMIC_ADD_POLYFILL = "" + "__device__ double atomicAdd(double* address, double val){ \n" + " unsigned long long int* address_as_ull = (unsigned long long int*)address; \n" + " unsigned long long int old = *address_as_ull, assumed; 
\n" + " do { \n" + " assumed = old; \n" + " old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); \n" + " } while (assumed != old); \n" + " return __longlong_as_double(old); \n" + "} \n"; + +const std::string KernelGenerator::getKernelPrefix() { + return KernelGenerator::KERNEL_PREFIX + " void"; +} +std::string KernelGenerator::generateStdFunctions() { + Dimensions dims({1, 1, 1}); + std::string gspar_get_gid = "__device__ size_t gspar_get_global_id(unsigned int dimension) { \n"; + std::string gspar_get_tid = "__device__ size_t gspar_get_thread_id(unsigned int dimension) { \n"; + std::string gspar_get_bid = "__device__ size_t gspar_get_block_id(unsigned int dimension) { \n"; + std::string gspar_get_bsize = "__device__ size_t gspar_get_block_size(unsigned int dimension) { \n"; + std::string gspar_get_gridsize = "__device__ size_t gspar_get_grid_size(unsigned int dimension) { \n"; + for (int d = 0; d < dims.getCount(); d++) { + std::string dimName = dims.getName(d); + gspar_get_gid += " if (dimension == " + std::to_string(d) + ") return blockIdx." + dimName + " * blockDim."+dimName+" + threadIdx." + dimName + "; \n"; + gspar_get_tid += " if (dimension == " + std::to_string(d) + ") return threadIdx." + dimName + "; \n"; + gspar_get_bid += " if (dimension == " + std::to_string(d) + ") return blockIdx." + dimName + "; \n"; + gspar_get_bsize += " if (dimension == " + std::to_string(d) + ") return blockDim." + dimName + "; \n"; + gspar_get_gridsize += " if (dimension == " + std::to_string(d) + ") return gridDim." + dimName + "; \n"; + } + gspar_get_gid += " return 0; } \n"; + gspar_get_tid += " return 0; } \n"; + gspar_get_bid += " return 0; } \n"; + gspar_get_bsize += " return 0; } \n"; + gspar_get_gridsize += " return 0; } \n"; + + return gspar_get_gid + gspar_get_tid + gspar_get_bid + gspar_get_bsize + gspar_get_gridsize + + "extern \"C\" __device__ void gspar_synchronize_local_threads() { __syncthreads(); } \n" + // Atomic functions + "__device__ int gspar_atomic_add_int(int* valq, int delta) { return atomicAdd(valq, delta); } \n" + "__device__ double gspar_atomic_add_double(double* valq, double delta) { return atomicAdd(valq, delta); } \n" + ; +} +std::string KernelGenerator::replaceMacroKeywords(std::string kernelSource) { + kernelSource = std::regex_replace(kernelSource, std::regex("GSPAR_DEVICE_MACRO_BEGIN"), "#define"); + kernelSource = std::regex_replace(kernelSource, std::regex("GSPAR_DEVICE_MACRO_END"), "\n"); + return kernelSource; +} +std::string KernelGenerator::generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::string r = ""; + if (pattern->isUsingSharedMemory()) { + auto shmem = pattern->getSharedMemoryParameter(); + r += KernelGenerator::SHARED_MEMORY_PREFIX + " " + shmem->getNonPointerTypeName() + " " + shmem->name + "[];"; + } + return r; +} +std::string KernelGenerator::generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::string r = ""; + for(int d = 0; d < dims.getCount(); d++) { + if (dims.is(d)) { + std::string varName = this->getStdVarNameForDimension(pattern->getStdVarNames(), d); + r += "const unsigned long gspar_max_" + varName + ","; + if (dims[d].min && !pattern->isBatched()) { // Same check as generateStdVariables + // TODO Support min in batches + r += "const unsigned long gspar_min_" + varName + ","; + } + } + } + if (pattern->isBatched()) { + // This names are used in other methods + r += "unsigned int gspar_batch_size,"; + } + for(auto ¶m : 
pattern->getParameterList()) { + if (param->direction != Pattern::ParameterDirection::GSPAR_PARAM_NONE) { + if (param->direction == Pattern::ParameterDirection::GSPAR_PARAM_IN && param->isConstant()) { + r += "const "; + } + r += param->toKernelParameter() + ","; + } + } + if (!r.empty()) r.pop_back(); // removes last comma + return r; +} +std::string KernelGenerator::generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::array patternNames = pattern->getStdVarNames(); + + std::string r; + for(int d = 0; d < dims.getCount(); d++) { + if (dims[d]) { + std::string varName = this->getStdVarNameForDimension(patternNames, d); + // Standard variables are uint3 according do CUDA specification + // By using size_t we can keep the same type of OpenCL driver + if (pattern->isBatched()) { + r += "size_t gspar_global_" + varName; + } else { + r += "size_t " + varName; + } + r += " = gspar_get_global_id(" + std::to_string(d) + ")"; + if (dims[d].min && !pattern->isBatched()) { // Same check as generateParams + // TODO Support min in batches + r += " + gspar_min_" + varName; + } + r += "; \n"; + // TODO Support multi-dimensional batches + if (pattern->isBatched()) { + // Intended implicit floor(gspar_global/dims) + r += "size_t gspar_batch_" + varName + " = ((size_t)(gspar_global_" + varName + " / gspar_max_" + varName + ")); \n"; + r += "size_t gspar_offset_" + varName + " = gspar_batch_" + varName + " * gspar_max_" + varName + "; \n"; + // This variable names are used in other methods, keep track + r += "size_t " + varName + " = gspar_global_" + varName + " - gspar_offset_" + varName + "; \n"; + } + } + } + return r; +} +std::string KernelGenerator::generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions max) { + std::array patternNames = pattern->getStdVarNames(); + // TODO Support multi-dimensional batches + std::string stdVarFirstDimension = this->getStdVarNameForDimension(patternNames, 0); + + std::string r = ""; + for(auto ¶m : pattern->getParameterList()) { + if (param->isBatched()) { + if (param->direction == Pattern::ParameterDirection::GSPAR_PARAM_IN && param->isConstant()) { + r += "const "; + } + r += param->type.getFullName() + " " + param->name + " = "; + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + r += "&" + param->getKernelParameterName() + "[gspar_offset_" + stdVarFirstDimension + "]"; + } else if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { + r += param->getKernelParameterName() + "[gspar_batch_" + stdVarFirstDimension + "]"; + } + r += ";\n"; + } + } + return r; +} diff --git a/src/GSPar_CUDA.hpp b/src/GSPar_CUDA.hpp new file mode 100644 index 0000000..fc91c02 --- /dev/null +++ b/src/GSPar_CUDA.hpp @@ -0,0 +1,262 @@ + +#ifndef __GSPAR_CUDA_INCLUDED__ +#define __GSPAR_CUDA_INCLUDED__ + +#include +#include +#include +#include +#include +#include + +///// Forward declarations ///// + +namespace GSPar { + namespace Driver { + namespace CUDA { + class Exception; + class ExecutionFlow; + class AsyncExecutionSupport; + class Instance; + class Device; + class Kernel; + class MemoryObject; + class ChunkedMemoryObject; + class StreamElement; + class KernelGenerator; + } + } +} + +#include "GSPar_BaseGPUDriver.hpp" + +namespace GSPar { + namespace Driver { + namespace CUDA { + + ///// Exception ///// + + class Exception : + public BaseException { + protected: + std::string getErrorString(CUresult code) override; + + public: + explicit Exception(std::string msg, 
std::string details = ""); + explicit Exception(CUresult code, std::string details = ""); + + static Exception* checkError(CUresult code, std::string details = ""); + static void throwIfFailed(CUresult code, std::string details = ""); + }; + + #define throwCompilationExceptionIfFailed( code, cudaProgram ) CompilationException::throwIfFailed( code, cudaProgram, defaultExceptionDetails() ) + + class CompilationException : + public BaseException { + protected: + std::string getErrorString(nvrtcResult code) override; + + public: + explicit CompilationException(std::string msg, std::string details = ""); + explicit CompilationException(nvrtcResult code, std::string details = ""); + + static CompilationException* checkError(nvrtcResult code, std::string details = ""); + static void throwIfFailed(nvrtcResult code, std::string details = ""); + static void throwIfFailed(nvrtcResult code, nvrtcProgram cudaProgram, std::string details = ""); + }; + + ///// ExecutionFlow ///// + + class ExecutionFlow : + virtual public BaseExecutionFlow { + public: + ExecutionFlow(); + explicit ExecutionFlow(Device* device); + virtual ~ExecutionFlow(); + CUstream start() override; + void synchronize() override; + + static CUstream checkAndStartFlow(Device* device, ExecutionFlow* executionFlow = NULL); + }; + + ///// AsyncExecutionSupport ///// + + class AsyncExecutionSupport : + virtual public BaseAsyncExecutionSupport { + public: + AsyncExecutionSupport(CUstream asyncObj = NULL); + void waitAsync() override; + + static void waitAllAsync(std::initializer_list asyncs); + }; + + ///// Instance ///// + + class Instance : + public BaseInstance { + protected: + static Instance *instance; + void loadGpuList() override; + + public: + Instance(); + virtual ~Instance(); + void init() override; + unsigned int getGpuCount() override; + + static Instance* getInstance(); + }; + + ///// Device ///// + + class Device : + public BaseDevice { + private: + mutable std::mutex attributeCacheMutex; + std::map attributeCache; + int deviceId; + + public: + Device(); + explicit Device(int ordinal); + virtual ~Device(); + ExecutionFlow* getDefaultExecutionFlow() override; + CUcontext getContext() override; + CUstream startDefaultExecutionFlow() override; + unsigned int getDeviceId(); + const std::string getName() override; + unsigned int getComputeUnitsCount() override; + unsigned int getWarpSize() override; + unsigned int getMaxThreadsPerBlock() override; + unsigned long getGlobalMemorySizeBytes() override; + unsigned long getLocalMemorySizeBytes() override; + unsigned long getSharedMemoryPerComputeUnitSizeBytes() override; + unsigned int getClockRateMHz() override; + bool isIntegratedMainMemory() override; + MemoryObject* malloc(long size, void* hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) override; + MemoryObject* malloc(long size, const void* hostPtr = nullptr) override; + ChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, void** hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) override; + ChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, const void** hostPtr = nullptr) override; + Kernel* prepareKernel(const std::string kernelSource, const std::string kernelName) override; + std::vector prepareKernels(const std::string kernelSource, const std::vector kernelNames) override; + + // const char* queryInfoText(cl_device_info paramName); + const int queryInfoNumeric(CUdevice_attribute paramName, bool cacheable = true); + std::tuple 
compileCudaProgramAndLoadModule(std::string source, const std::string programName); + }; + + ///// Kernel ///// + + class Kernel : + public BaseKernel, + public AsyncExecutionSupport { + private: + nvrtcProgram cudaProgram = NULL; + CUmodule cudaModule = NULL; + CUfunction cudaFunction = NULL; + std::vector kernelParams; + bool isPrecompiled; + std::map attributeCache; + + void loadCudaFunction(const std::string kernelName); + + public: + Kernel(); + Kernel(Device* device, const std::string kernelSource, const std::string kernelName); + virtual ~Kernel(); + virtual void cloneInto(BaseKernelBase* baseOther) override; + int setParameter(MemoryObject* memoryObject) override; + int setParameter(ChunkedMemoryObject* chunkedMemoryObject) override; + int setParameter(size_t parmSize, void* parm) override; + int setParameter(size_t parmSize, const void* parm) override; + void clearParameters() override; + Dimensions getNumBlocksAndThreadsFor(Dimensions dims) override; + void runAsync(Dimensions max, ExecutionFlow* executionFlow = NULL) override; + + Kernel(Device* device, nvrtcProgram cudaProgram, CUmodule cudaModule, const std::string kernelName); + const int queryInfoNumeric(CUfunction_attribute paramName, bool cacheable = true); + }; + + ///// MemoryObject ///// + + class MemoryObject : + public BaseMemoryObject, + public AsyncExecutionSupport { + private: + void allocDeviceMemory(); + public: + MemoryObject(Device* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly); + MemoryObject(Device* device, size_t size, const void* hostPtr); + virtual ~MemoryObject(); + virtual void pinHostMemory() override; + virtual void copyIn() override; + virtual void copyOut() override; + virtual void copyInAsync(ExecutionFlow* executionFlow = NULL) override; + virtual void copyOutAsync(ExecutionFlow* executionFlow = NULL) override; + }; + + ///// ChunkedMemoryObject ///// + + class ChunkedMemoryObject : + public BaseChunkedMemoryObject, + public AsyncExecutionSupport { + private: + void allocDeviceMemory(); + + public: + ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly); + ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, const void** hostPointers); + virtual ~ChunkedMemoryObject(); + virtual void pinHostMemory() override; + // Copy all chunks + virtual void copyIn() override; + virtual void copyOut() override; + virtual void copyInAsync(ExecutionFlow* executionFlow = NULL) override; + virtual void copyOutAsync(ExecutionFlow* executionFlow = NULL) override; + // Copy specific chunks of memory. We can't use function overloading due to the override. 
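+            // Illustrative sketch only (dev, hostPtrs and flow are hypothetical names, not part of this patch):
+            //   ChunkedMemoryObject* chunked = dev->mallocChunked(chunks, chunkSize, hostPtrs);
+            //   chunked->copyIn(0);               // blocking copy of the first chunk only
+            //   chunked->copyOutAsync(1, flow);   // asynchronous copy-out of the second chunk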
+ virtual void copyIn(unsigned int chunk); + virtual void copyOut(unsigned int chunk); + virtual void copyInAsync(unsigned int chunk, ExecutionFlow* executionFlow = NULL); + virtual void copyOutAsync(unsigned int chunk, ExecutionFlow* executionFlow = NULL); + }; + + ///// StreamElement ///// + + class StreamElement : + public BaseStreamElement, + public AsyncExecutionSupport, + public ExecutionFlow { + private: + Kernel* kernel; + + public: + explicit StreamElement(Device* device); + ~StreamElement(); + }; + + ///// KernelGenerator ///// + + class KernelGenerator : + public BaseKernelGenerator { + public: + static const std::string KERNEL_PREFIX; + static const std::string GLOBAL_MEMORY_PREFIX; + static const std::string SHARED_MEMORY_PREFIX; + static const std::string CONSTANT_PREFIX; + static const std::string DEVICE_FUNCTION_PREFIX; + static const std::string ATOMIC_ADD_POLYFILL; + const std::string getKernelPrefix() override; + std::string generateStdFunctions() override; + std::string replaceMacroKeywords(std::string kernelSource) override; + std::string generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + + }; + + } + } +} + +#endif diff --git a/src/GSPar_OpenCL.cpp b/src/GSPar_OpenCL.cpp new file mode 100644 index 0000000..258bd02 --- /dev/null +++ b/src/GSPar_OpenCL.cpp @@ -0,0 +1,1051 @@ + +#include +#include +#include +#include +#ifdef GSPAR_DEBUG +#include +#include +#endif + +#include "GSPar_OpenCL.hpp" + +using namespace GSPar::Driver::OpenCL; + +// extern "C" void CL_CALLBACK ocl_pfn_notify(const char *errinfo, const void *private_info, size_t cb, void *user_data) { +// std::cerr << "OpenCL notified an error: " << errinfo << std::endl; +// } + +///// Exception ///// + +std::string Exception::getErrorString(cl_int code) { + switch(code) { + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return 
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } +} + +Exception::Exception(std::string msg, std::string details) : BaseException(msg, details) { } +Exception::Exception(cl_int code, std::string details) : BaseException(code, details) { + // Can't call this virtual function in the base constructor + this->msg = this->getErrorString(code); +} +// static +Exception* Exception::checkError(cl_int code, std::string details) { + return BaseException::checkError(code, CL_SUCCESS, details); +} +// static +void Exception::throwIfFailed(cl_int code, std::string details) { + BaseException::throwIfFailed(code, CL_SUCCESS, details); +} + +Exception::Exception(cl_int code, cl_program program, cl_device_id device) : Exception(code) { + if (code == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = new char[log_size]; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + this->msg += std::string(" - ") + std::string(log); + } +} +// static +Exception* Exception::checkError(cl_int code, cl_program program, cl_device_id device) { + if (code != CL_SUCCESS) { + return new Exception(code, program, device); + } + return NULL; +} +// static +void Exception::throwIfFailed(cl_int code, cl_program program, cl_device_id device) { + Exception* ex = Exception::checkError(code, program, device); + if (ex != NULL) { + throw *ex; + } +} + + +///// ExecutionFlow ///// + +ExecutionFlow::ExecutionFlow() : BaseExecutionFlow() { } +ExecutionFlow::ExecutionFlow(Device* device) : BaseExecutionFlow(device) { } +ExecutionFlow::~ExecutionFlow() { + // We don't throw exceptions on destructors + if (this->flowObject) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + 
ss << "[" << std::this_thread::get_id() << " GSPar ExFlow] Releasing command queue " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseCommandQueue(this->flowObject) ); + if (ex != nullptr) { + std::cerr << "Failed when releasing OpenCL command queue of execution flow: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + } +} +cl_command_queue ExecutionFlow::start() { + if (!this->device) { + // Can't start flow on a NULL device + throw Exception("A device is required to start an execution flow", defaultExceptionDetails()); + } + if (!this->flowObject) { + this->device = device; + cl_int status; + this->flowObject = clCreateCommandQueue(device->getContext(), device->getBaseDeviceObject(), 0, &status); + throwExceptionIfFailed(status); + } + return this->getBaseFlowObject(); +} +void ExecutionFlow::synchronize() { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar ExFlow " << this << "] Synchronizing" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + // clEnqueueMarker(cl_command_queue, cl_event) is deprecated in OpenCL 1.2 + + // cl_event evt; + // throwExceptionIfFailed( clEnqueueMarkerWithWaitList(this->getBaseFlowObject(), 0, NULL, &evt) ); + // throwExceptionIfFailed( clWaitForEvents(1, &evt) ); + // throwExceptionIfFailed( clReleaseEvent(evt) ); + throwExceptionIfFailed( clFinish(this->flowObject) ); +} +cl_command_queue ExecutionFlow::checkAndStartFlow(Device* device, ExecutionFlow* executionFlow) { + return BaseExecutionFlow::checkAndStartFlow(device, executionFlow); +} + + + +///// AsyncExecutionSupport ///// + +AsyncExecutionSupport::AsyncExecutionSupport(cl_event *asyncObjs, unsigned int numAsyncEvents) : + BaseAsyncExecutionSupport(asyncObjs), numAsyncEvents(numAsyncEvents) { } +AsyncExecutionSupport::~AsyncExecutionSupport() { + try { + this->releaseBaseAsyncObject(); + } catch (GSPar::GSParException &ex) { // We don't throw exceptions on destructors + std::cerr << "Failed when releasing OpenCL event on AsyncExecutionSupport destructor: "; + std::cerr << ex.what() << " - " << ex.getDetails() << std::endl; + this->asyncObject = NULL; + } +} +void AsyncExecutionSupport::setBaseAsyncObject(cl_event *asyncObject) { + this->setBaseAsyncObject(asyncObject, 1); +} +void AsyncExecutionSupport::setBaseAsyncObject(cl_event *asyncObject, unsigned int numAsyncEvents) { + this->releaseBaseAsyncObject(); // Release current object + BaseAsyncExecutionSupport::setBaseAsyncObject(asyncObject); + this->numAsyncEvents = numAsyncEvents; +} +void AsyncExecutionSupport::waitAsync() { + if (this->executionFlow) { + this->executionFlow->synchronize(); + } else if (this->asyncObject) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Async " << this << "] Waiting for " << this->numAsyncEvents << " events: " << this->asyncObject << std::endl; + std::cout << ss.str(); + ss.str(""); + + // CL_QUEUED: 3 + // CL_SUBMITTED: 2 + // CL_RUNNING: 1 + // CL_COMPLETE: 0 + cl_int status; + throwExceptionIfFailed( clGetEventInfo(*this->asyncObject, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL) ); + // CL_COMMAND_NDRANGE_KERNEL: 4592 + // CL_COMMAND_TASK: 4593 + // CL_COMMAND_NATIVE_KERNEL: 4594 + // CL_COMMAND_READ_BUFFER: 4595 + // CL_COMMAND_WRITE_BUFFER: 4596 + // 
CL_COMMAND_COPY_BUFFER: 4597 + // CL_COMMAND_READ_IMAGE: 4598 + // CL_COMMAND_WRITE_IMAGE: 4599 + // CL_COMMAND_COPY_IMAGE: 4600 + // CL_COMMAND_COPY_BUFFER_TO_IMAGE: 4602 + // CL_COMMAND_COPY_IMAGE_TO_BUFFER: 4601 + // CL_COMMAND_MAP_BUFFER: 4603 + // CL_COMMAND_MAP_IMAGE: 4604 + // CL_COMMAND_UNMAP_MEM_OBJECT: 4605 + // CL_COMMAND_MARKER: 4606 + // CL_COMMAND_ACQUIRE_GL_OBJECTS: 4607 + // CL_COMMAND_RELEASE_GL_OBJECTS: 4608 + // CL_COMMAND_READ_BUFFER_RECT: 4609 + // CL_COMMAND_WRITE_BUFFER_RECT: 4610 + // CL_COMMAND_COPY_BUFFER_RECT: 4611 + // CL_COMMAND_USER: 4612 + // CL_COMMAND_BARRIER: 4613 + // CL_COMMAND_MIGRATE_MEM_OBJECTS: 4614 + // CL_COMMAND_FILL_BUFFER: 4615 + // CL_COMMAND_FILL_IMAGE: 4616 + // CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR: 8205 + cl_command_type type; + throwExceptionIfFailed( clGetEventInfo(*this->asyncObject, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &type, NULL) ); + + ss << "[" << std::this_thread::get_id() << " GSPar Async " << this << "] Event " << this->asyncObject << " of type " << type << " is of status " << status << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + throwExceptionIfFailed( clWaitForEvents(this->numAsyncEvents, this->asyncObject) ); + } + this->releaseBaseAsyncObject(); +} +void AsyncExecutionSupport::releaseBaseAsyncObject() { + if (this->executionFlow) { + // We don't own this ExecutionFlow, it's just a weak reference, so we don't delete it + this->executionFlow = nullptr; + } + if (this->asyncObject) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Async " << this << "] Releasing " << this->numAsyncEvents << " events: " << this->asyncObject << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + for (unsigned int i = 0; i < this->numAsyncEvents; i++) { + throwExceptionIfFailed( clReleaseEvent(this->asyncObject[i]) ); + } + this->asyncObject = NULL; + } + this->clearRunningAsync(); // We can't be running async since we don't have the async objects anymore +} +// static +void AsyncExecutionSupport::waitAllAsync(std::initializer_list asyncs) { + std::vector oclEvents; + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Async] Waiting for all async events" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + for (auto async : asyncs) { + // std::cout << "Waiting for all cl_events " << async->asyncObject << " " << *async->asyncObject << std::endl; + oclEvents.insert(oclEvents.end(), async->getBaseAsyncObject(), async->getBaseAsyncObject()+async->numAsyncEvents); + } + if (oclEvents.size() > 0) { + throwExceptionIfFailed( clWaitForEvents(oclEvents.size(), oclEvents.data()) ); + } + for (auto async : asyncs) { + async->releaseBaseAsyncObject(); + } +} + + +///// Instance ///// + +Instance *Instance::instance = nullptr; + +void Instance::loadGpuList() { + this->clearGpuList(); + + cl_uint platformCount; + throwExceptionIfFailed( clGetPlatformIDs(0, NULL, &platformCount) ); + + cl_platform_id* platforms = new cl_platform_id[platformCount]; + throwExceptionIfFailed( clGetPlatformIDs(platformCount, platforms, NULL) ); + + for (unsigned int i = 0; i < platformCount; ++i) { + cl_uint deviceCount; + throwExceptionIfFailed( clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &deviceCount) ); + + cl_device_id* deviceIds = new cl_device_id[deviceCount]; + throwExceptionIfFailed( clGetDeviceIDs(platforms[i], 
CL_DEVICE_TYPE_GPU, deviceCount, deviceIds, NULL) ); + + for (unsigned int d = 0; d < deviceCount; ++d) { + this->devices.push_back(new Device(deviceIds[d])); + } + } + + delete[] platforms; +} + +Instance::Instance() : BaseInstance(Runtime::GSPAR_RT_OPENCL) { } +Instance::~Instance() { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Instance] Deleting Singleton instance " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Instance::instance = nullptr; +} +Instance* Instance::getInstance() { + // TODO implement thread-safety + if (!instance) { + instance = new Instance(); + } + return instance; +} + +void Instance::init() { + this->instanceInitiated = true; +} + +unsigned int Instance::getGpuCount() { + unsigned int gpuCount = 0; + + cl_uint platformCount; + throwExceptionIfFailed( clGetPlatformIDs(0, NULL, &platformCount) ); + + cl_platform_id* platforms = new cl_platform_id[platformCount]; + throwExceptionIfFailed( clGetPlatformIDs(platformCount, platforms, NULL) ); + + for (unsigned int i = 0; i < platformCount; ++i) { + cl_uint deviceCount; + throwExceptionIfFailed( clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &deviceCount) ); + + gpuCount += deviceCount; + } + + delete[] platforms; + + return gpuCount; +} + + +///// Device ///// + +Device::Device() : BaseDevice() { } +Device::Device(cl_device_id device) { + this->setBaseDeviceObject(device); +} +Device::~Device() { + // We don't throw exceptions on destructors + if (this->defaultExecutionFlow) { + delete this->defaultExecutionFlow; + this->defaultExecutionFlow = NULL; + } + + if (this->libContext) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Device] Releasing context " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseContext(this->libContext) ); + if (ex) { + std::cerr << "Failed when releasing device context on Device's destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->libContext = NULL; + } +} +ExecutionFlow* Device::getDefaultExecutionFlow() { + if (!this->defaultExecutionFlow) { + this->defaultExecutionFlow = new ExecutionFlow(this); + } + return this->defaultExecutionFlow; +} +cl_context Device::getContext() { + if (!this->libContext) { + std::lock_guard lock(this->libContextMutex); + if (!this->libContext) { // Check if someone changed it while we were waiting for the lock + cl_int status; + // TODO add a CL_CALLBACK to get notified of errors. 
Check opencl versions in test/comparison for an example + cl_context context = clCreateContext(NULL, 1, &this->libDevice, NULL, NULL, &status); + throwExceptionIfFailed(status); + this->setContext(context); + } + // Auto-unlock of libContextMutex, RAII + } + return this->libContext; +} +cl_command_queue Device::startDefaultExecutionFlow() { + return this->getDefaultExecutionFlow()->start(); +} +const std::string Device::getName() { + return this->queryInfoDevice(CL_DEVICE_NAME); +} +unsigned int Device::getComputeUnitsCount() { + return *(this->queryInfoDevice(CL_DEVICE_MAX_COMPUTE_UNITS)); +} +unsigned int Device::getWarpSize() { + // TODO warp size is available only for NVIDIA GPUs + return *(this->queryInfoDevice(CL_DEVICE_WARP_SIZE_NV)); +} +unsigned int Device::getMaxThreadsPerBlock() { + return *(this->queryInfoDevice(CL_DEVICE_MAX_WORK_GROUP_SIZE)); +} +unsigned long Device::getGlobalMemorySizeBytes() { + return *(this->queryInfoDevice(CL_DEVICE_GLOBAL_MEM_SIZE)); +} +unsigned long Device::getLocalMemorySizeBytes() { + return *(this->queryInfoDevice(CL_DEVICE_LOCAL_MEM_SIZE)); +} +unsigned long Device::getSharedMemoryPerComputeUnitSizeBytes() { + return *(this->queryInfoDevice(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE)); +} +unsigned int Device::getClockRateMHz() { + return *(this->queryInfoDevice(CL_DEVICE_MAX_CLOCK_FREQUENCY)); +} +bool Device::isIntegratedMainMemory() { + // CL_DEVICE_HOST_UNIFIED_MEMORY is deprecated in OpenCL 1.2 + // should probably use CL_DEVICE_SVM_CAPABILITIES instead in OpenCL 2.0 + return *(this->queryInfoDevice(CL_DEVICE_HOST_UNIFIED_MEMORY)); +} +MemoryObject* Device::malloc(long size, void* hostPtr, bool readOnly, bool writeOnly) { + return new MemoryObject(this, size, hostPtr, readOnly, writeOnly); +} +MemoryObject* Device::malloc(long size, const void* hostPtr) { + return new MemoryObject(this, size, hostPtr); +} +ChunkedMemoryObject* Device::mallocChunked(unsigned int chunks, long chunkSize, void** hostPointers, bool readOnly, bool writeOnly) { + return new ChunkedMemoryObject(this, chunks, chunkSize, hostPointers, readOnly, writeOnly); +} +ChunkedMemoryObject* Device::mallocChunked(unsigned int chunks, long chunkSize, const void** hostPointers) { + return new ChunkedMemoryObject(this, chunks, chunkSize, hostPointers); +} +Kernel* Device::prepareKernel(const std::string kernel_source, const std::string kernel_name) { + return new Kernel(this, kernel_source, kernel_name); +} +std::vector Device::prepareKernels(const std::string kernelSource, const std::vector kernelNames) { + cl_program oclProgram = this->compileOCLProgram(kernelSource); + + std::vector kernels; + for (auto name : kernelNames) { + kernels.push_back(new Kernel(this, oclProgram, name)); + } + return kernels; +} +template +const T* Device::queryInfoDevice(cl_device_info paramName, bool cacheable) { + //https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html + if (cacheable) { // Check if the attribute is cached + std::lock_guard lock(this->attributeCacheMutex); // Auto-unlock, RAII + auto it = this->attributeCache.find(paramName); + if (it != this->attributeCache.end()) { + return (T*)it->second; + } + } + + size_t valueSize; + clGetDeviceInfo(this->getBaseDeviceObject(), paramName, 0, NULL, &valueSize); + T* value = new T[valueSize]; + clGetDeviceInfo(this->getBaseDeviceObject(), paramName, valueSize, value, NULL); + if (cacheable) { // Stores the attribute in cache + std::lock_guard lock(this->attributeCacheMutex); // Auto-unlock, RAII + 
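+ // The heap buffer stored here is what the value-returning wrappers above (getComputeUnitsCount(),
+ // getGlobalMemorySizeBytes(), etc.) dereference; each cl_device_info key is queried and allocated
+ // only once, and later calls reuse the cached pointer.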
this->attributeCache[paramName] = value; + } + return value; +} +cl_program Device::compileOCLProgram(std::string source) { +#ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[GSPar Device " << this << "] Kernel received to compile: \n" << source << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + std::string openclExtensions = "#pragma OPENCL EXTENSION all: enable\n"; + std::string completeKernelSource = ""; + completeKernelSource.append(openclExtensions); + completeKernelSource.append(Instance::getInstance()->getKernelGenerator()->generateStdFunctions()); + completeKernelSource.append(Instance::getInstance()->getKernelGenerator()->replaceMacroKeywords(source)); + +#ifdef GSPAR_DEBUG + ss << "[GSPar Device " << this << "] Complete kernel for compilation: \n" << completeKernelSource << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + cl_program oclProgram; + cl_device_id devId = this->getBaseDeviceObject(); + + // Place for inserting any additional macros + std::string macrosGspar = ""; + macrosGspar.append("-D GSPAR_DEVICE_KERNEL=" + KernelGenerator::KERNEL_PREFIX); + macrosGspar.append(" -D GSPAR_DEVICE_GLOBAL_MEMORY=" + KernelGenerator::GLOBAL_MEMORY_PREFIX); + macrosGspar.append(" -D GSPAR_DEVICE_SHARED_MEMORY=" + KernelGenerator::SHARED_MEMORY_PREFIX); + macrosGspar.append(" -D GSPAR_DEVICE_CONSTANT=" + KernelGenerator::CONSTANT_PREFIX); + macrosGspar.append(" -D GSPAR_DEVICE_FUNCTION=" + KernelGenerator::DEVICE_FUNCTION_PREFIX); + const char *compilationOptions = macrosGspar.c_str(); + +#ifdef GSPAR_DEBUG + ss << "[GSPar Device " << this << "] Compiling kernel with arguments: " << compilationOptions; + ss << std::endl; + std::cout << ss.str(); + ss.str(""); +#endif + + cl_int status; + const char* src = completeKernelSource.c_str(); + oclProgram = clCreateProgramWithSource(this->getContext(), 1, &src, NULL, &status); + Exception::throwIfFailed(status, oclProgram, devId); + + status = clBuildProgram(oclProgram, 1, &devId, compilationOptions, NULL, NULL); + Exception::throwIfFailed(status, oclProgram, devId); + + return oclProgram; +} + +///// Kernel ///// + +void Kernel::loadOclKernel(const std::string kernelName) { + cl_int status; + this->oclKernel = clCreateKernel(this->oclProgram, kernelName.c_str(), &status); + Exception::throwIfFailed(status, this->oclProgram, this->device->getBaseDeviceObject()); + this->kernelName = kernelName; +} + +Kernel::Kernel() : BaseKernel() { } +Kernel::Kernel(Device* device, const std::string kernelSource, const std::string kernelName) : BaseKernel(device, kernelSource, kernelName) { + this->oclProgram = device->compileOCLProgram(kernelSource); + + this->isPrecompiled = false; //Kernel owns oclProgram + + this->loadOclKernel(kernelName); +} +Kernel::Kernel(Device* device, cl_program oclProgram, const std::string kernelName) : BaseKernel(device) { + this->oclProgram = oclProgram; + this->isPrecompiled = true; //Kernel shares oclProgram + + this->loadOclKernel(kernelName); +} +Kernel::~Kernel() { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + #endif + if (!this->isPrecompiled && this->oclProgram) { + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Kernel] Releasing oclProgram " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseProgram(this->oclProgram) ); // We don't throw exceptions on destructors + if (ex != nullptr) { + 
std::cerr << "Failed when releasing OpenCL program on Kernel destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->oclProgram = NULL; + } + if (this->oclKernel) { + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Kernel " << this << "] Releasing oclKernel" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseKernel(this->oclKernel) ); // We don't throw exceptions on destructors + if (ex != nullptr) { + std::cerr << "Failed when releasing OpenCL kernel on Kernel destructor: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->oclKernel = NULL; + } +} +void Kernel::cloneInto(BaseKernelBase* baseOther) { + BaseKernel::cloneInto(baseOther); + Kernel* other = static_cast(baseOther); + other->oclProgram = this->oclProgram; + // cl_kernel objects are not thread-safe (OpenCL 1.2 Specification p. 360) + other->loadOclKernel(this->kernelName); + // We do not mark this kernel as precompiled, so it destroys the cl_program on destructor. + // However, once it is destroyed, the cloned pattern cannot be further cloned because we need the program to call clCreateKernel (called during the clone process) + // TODO I haven't tested, but this probably causes issues + // this->isPrecompiled = true; + other->isPrecompiled = true; +} +int Kernel::setParameter(MemoryObject* memoryObject) { + cl_mem oclObject = memoryObject->getBaseMemoryObject(); + throwExceptionIfFailed( clSetKernelArg(this->oclKernel, this->parameterCount++, sizeof(cl_mem), &oclObject) ); + return this->parameterCount; +} +int Kernel::setParameter(ChunkedMemoryObject* chunkedMemoryObject) { + cl_mem oclObject = chunkedMemoryObject->getBaseMemoryObject(); + throwExceptionIfFailed( clSetKernelArg(this->oclKernel, this->parameterCount++, sizeof(cl_mem), &oclObject) ); + return this->parameterCount; +} +int Kernel::setParameter(size_t parm_size, void* parm) { + // clSetKernelArg expects a const void*, so we can treat all pointers as const + return this->setParameter(parm_size, const_cast(parm)); +} +int Kernel::setParameter(size_t parm_size, const void* parm) { + // The argument data pointed to by arg_value is copied and the arg_value pointer can therefore be reused by the application after clSetKernelArg returns. + // https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clSetKernelArg.html + throwExceptionIfFailed( clSetKernelArg(this->oclKernel, this->parameterCount++, parm_size, parm) ); + return this->parameterCount; +} +GSPar::Driver::Dimensions Kernel::getNumBlocksAndThreadsFor(Dimensions dims) { + + // CL_DEVICE_MAX_WORK_GROUP_SIZE is usually 1024, but CL_KERNEL_WORK_GROUP_SIZE is 256. + // In general, the kernels works just fine with 1024 even with the 256 limitation reported by CL_KERNEL_WORK_GROUP_SIZE. + // What limit should we use? + // unsigned int maxThreadsPerBlock = this->device->getMaxThreadsPerBlock(); //CL_DEVICE_MAX_WORK_GROUP_SIZE + const size_t *kernelWorkGroupSize = this->queryInfo(CL_KERNEL_WORK_GROUP_SIZE); + unsigned int maxThreadsPerBlock = *kernelWorkGroupSize; + + // Should we check CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS? We only support 3 dimensions anyway. 
+ const size_t* maxWorkItemSizes = this->device->queryInfoDevice(CL_DEVICE_MAX_WORK_ITEM_SIZES); + size_t maxThreadsDimension[3]; // Copying data to remove constness + memcpy(maxThreadsDimension, maxWorkItemSizes, sizeof(size_t) * 3); + + return this->getNumBlocksAndThreads(dims, maxThreadsPerBlock, maxThreadsDimension); +} +void Kernel::runAsync(Dimensions dims, ExecutionFlow* executionFlow) { + + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar Kernel " << this << "] Running kernel async with " << this->parameterCount << " parameters for " << dims.toString() << " in flow " << executionFlow << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + cl_command_queue oclQueue = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + + if (!dims.x) { + throw Exception("The first dimension is required to run a kernel"); + } + + Dimensions blocksAndThreads = this->getNumBlocksAndThreadsFor(dims); + + int dimensions = dims.getCount(); + + size_t localSize[dimensions]; + size_t globalSize[dimensions]; + for (int d = 0; d < dimensions; d++) { + localSize[d] = blocksAndThreads[d].max; + globalSize[d] = blocksAndThreads[d].min * localSize[d]; + } + + // Set shared memory - https://community.khronos.org/t/dynamically-allocated-shared-memory/1562 + if (this->sharedMemoryBytes > 0) { + throwExceptionIfFailed( clSetKernelArg(this->oclKernel, this->parameterCount++, this->sharedMemoryBytes, NULL) ); + } + + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Kernel " << this << "] Shall start " << dims.toString() << " threads: "; + ss << "starting (" << globalSize[0]; + if (dims.y) ss << "," << globalSize[1]; + if (dims.z) ss << "," << globalSize[2]; + ss << ") threads "; + ss << "divided in blocks of (" << localSize[0]; + if (dims.y) ss << "," << localSize[1]; + if (dims.z) ss << "," << localSize[2]; + ss << ") threads "; + ss << "using " << this->sharedMemoryBytes << " bytes of shared memory in execution flow " << executionFlow << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + cl_event *evt = new cl_event; + throwExceptionIfFailed( clEnqueueNDRangeKernel(oclQueue, this->oclKernel, dimensions, NULL, globalSize, localSize, 0, NULL, evt) ); + #ifdef GSPAR_DEBUG + ss << "[" << std::this_thread::get_id() << " GSPar Kernel " << this << "] Setting evt to wait: " << evt << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + this->setBaseAsyncObject(evt); // setBaseAsyncObject sets runningAsync to false + // Use Execution Flow instead of the event for synchronization. See comment on executionFlow attribute. + this->setExecutionFlowToSynchronize(executionFlow ? 
executionFlow : this->device->getDefaultExecutionFlow()); + this->runningAsync = true; +} +template +T* Kernel::queryInfo(cl_kernel_work_group_info param, bool cacheable) { + // https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetKernelWorkGroupInfo.html + if (cacheable) { // Check if the attribute is cached + auto it = this->attributeCache.find(param); + if (it != this->attributeCache.end()) { + return (T*)it->second; + } + } + + size_t valueSize; + throwExceptionIfFailed( clGetKernelWorkGroupInfo(this->oclKernel, this->device->getBaseDeviceObject(), param, 0, NULL, &valueSize) ); + T* value = new T[valueSize]; + throwExceptionIfFailed( clGetKernelWorkGroupInfo(this->oclKernel, this->device->getBaseDeviceObject(), param, valueSize, value, NULL) ); + if (cacheable) { // Stores the attribute in cache + this->attributeCache[param] = value; + } + return value; +} + + +///// MemoryObject ///// + +void MemoryObject::copy(bool in, bool async, ExecutionFlow* executionFlow) { + cl_event *evt = new cl_event; + cl_bool blocking = async ? CL_FALSE : CL_TRUE; + int numEvtsToWait = 0; + cl_event *evtToWait = NULL; + if (this->getBaseAsyncObject()) { + numEvtsToWait = this->numAsyncEvents; + evtToWait = this->asyncObject; + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar MemObj " << this << "] Already has an async event: " << evtToWait << ", binding two events" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + } + + cl_command_queue oclQueue = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + + if (in) { + throwExceptionIfFailed( clEnqueueWriteBuffer( + oclQueue, this->devicePtr, + blocking, 0, this->size, this->hostPtr, + numEvtsToWait, evtToWait, evt) ); + } else { //copy out + throwExceptionIfFailed( clEnqueueReadBuffer( + oclQueue, this->devicePtr, + blocking, 0, this->size, this->hostPtr, + numEvtsToWait, evtToWait, evt) ); + } + if (this->getBaseAsyncObject()) { // Releases old async event handler + this->releaseBaseAsyncObject(); + } + if (async) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar MemObj " << this << "] Setting evt " << evt << " from queue " << oclQueue << " to wait" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + + this->setBaseAsyncObject(evt); // setBaseAsyncObject sets runningAsync to false + // Use Execution Flow instead of the event for synchronization. See comment on executionFlow attribute. + this->setExecutionFlowToSynchronize(executionFlow ? 
executionFlow : this->device->getDefaultExecutionFlow()); + this->runningAsync = true; + } +} + +void MemoryObject::allocDeviceMemory() { + cl_int status; + + // Security check is already done in base class + cl_mem_flags ocl_flags = CL_MEM_READ_WRITE; + if (this->isReadOnly()) { + ocl_flags = CL_MEM_READ_ONLY; + } else if (this->isWriteOnly()) { + ocl_flags = CL_MEM_WRITE_ONLY; + } + + this->devicePtr = clCreateBuffer(device->getContext(), ocl_flags, size, NULL, &status); + throwExceptionIfFailed(status); +} +MemoryObject::MemoryObject(Device* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly) : BaseMemoryObject(device, size, hostPtr, readOnly, writeOnly) { + this->allocDeviceMemory(); +} +MemoryObject::MemoryObject(Device* device, size_t size, const void* hostPtr) : BaseMemoryObject(device, size, hostPtr) { + this->allocDeviceMemory(); +} +MemoryObject::~MemoryObject() { + if (this->devicePtr) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar MemObj] Releasing Memory Object " << this << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + Exception* ex = Exception::checkError( clReleaseMemObject(this->devicePtr) ); // We don't throw exceptions on destructors + if (ex != nullptr) { + std::cerr << "Failed when releasing OpenCL memory object: "; + std::cerr << ex->what() << " - " << ex->getDetails() << std::endl; + delete ex; + } + this->devicePtr = NULL; + } +} +void MemoryObject::copyIn() { copy(true, false); } +void MemoryObject::copyOut() { copy(false, false); } +void MemoryObject::copyInAsync(ExecutionFlow* executionFlow) { copy(true, true, executionFlow); } +void MemoryObject::copyOutAsync(ExecutionFlow* executionFlow) { copy(false, true, executionFlow); } + + +///// ChunkedMemoryObject ///// + +void ChunkedMemoryObject::copy(bool in, bool async, unsigned int chunkFrom, unsigned int chunkTo, ExecutionFlow* executionFlow) { + unsigned int numChunksToCopy = chunkTo - chunkFrom; + cl_event *newEvents = new cl_event[numChunksToCopy]; + + cl_bool blocking = async ? 
CL_FALSE : CL_TRUE; + unsigned int currentNumEvents = 0; + cl_event *currentEvents = NULL; + if (this->getBaseAsyncObject()) { + currentNumEvents = this->numAsyncEvents; + currentEvents = this->asyncObject; + } + + cl_command_queue oclQueue = ExecutionFlow::checkAndStartFlow(this->device, executionFlow); + + for (unsigned int chunk = chunkFrom, evtIdx = 0; chunk < chunkTo; chunk++, evtIdx++) { + if (in) { + throwExceptionIfFailed( clEnqueueWriteBuffer( + oclQueue, this->devicePtr, + blocking, chunk * this->getChunkSize(), this->getChunkSize(), this->hostPointers[chunk], + currentNumEvents, currentEvents, &newEvents[evtIdx]) ); + } else { //copy out + throwExceptionIfFailed( clEnqueueReadBuffer( + oclQueue, this->devicePtr, + blocking, chunk * this->getChunkSize(), this->getChunkSize(), this->hostPointers[chunk], + currentNumEvents, currentEvents, &newEvents[evtIdx]) ); + } + } + if (this->getBaseAsyncObject()) { // Releases old async event handler + this->releaseBaseAsyncObject(); + } + if (async) { + #ifdef GSPAR_DEBUG + std::stringstream ss; // Using stringstream eases multi-threaded debugging + ss << "[" << std::this_thread::get_id() << " GSPar ChunkedMemObj " << this << "] Setting evts (" << numChunksToCopy << ") to wait: " << newEvents << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + this->setBaseAsyncObject(newEvents, numChunksToCopy); // setBaseAsyncObject sets runningAsync to false + // Use Execution Flow instead of the event for synchronization. See comment on executionFlow attribute. + this->setExecutionFlowToSynchronize(executionFlow ? executionFlow : this->device->getDefaultExecutionFlow()); + this->runningAsync = true; + } +} + +void ChunkedMemoryObject::allocDeviceMemory() { + cl_int status; + + // Security check is already done in base class + cl_mem_flags ocl_flags = CL_MEM_READ_WRITE; + if (this->isReadOnly()) { + ocl_flags = CL_MEM_READ_ONLY; + } else if (this->isWriteOnly()) { + ocl_flags = CL_MEM_WRITE_ONLY; + } + + // We allocate space for all the memory chunks + this->devicePtr = clCreateBuffer(device->getContext(), ocl_flags, this->getChunkSize() * this->chunks, NULL, &status); + throwExceptionIfFailed(status); +} +ChunkedMemoryObject::ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly) : + BaseChunkedMemoryObject(device, chunks, chunkSize, hostPointers, readOnly, writeOnly) { + this->allocDeviceMemory(); +} +ChunkedMemoryObject::ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, const void** hostPointers) : + BaseChunkedMemoryObject(device, chunks, chunkSize, hostPointers) { + this->allocDeviceMemory(); +} +ChunkedMemoryObject::~ChunkedMemoryObject() { + //devicePtr is released in ~MemoryObject +} +void ChunkedMemoryObject::copyIn() { copy(true, false, 0, this->chunks); } +void ChunkedMemoryObject::copyOut() { copy(false, false, 0, this->chunks); } +void ChunkedMemoryObject::copyInAsync(ExecutionFlow* executionFlow) { copy(true, true, 0, this->chunks, executionFlow); } +void ChunkedMemoryObject::copyOutAsync(ExecutionFlow* executionFlow) { copy(false, true, 0, this->chunks, executionFlow); } +void ChunkedMemoryObject::copyIn(unsigned int chunk) { copy(true, false, chunk, chunk+1); } +void ChunkedMemoryObject::copyOut(unsigned int chunk) { copy(false, false, chunk, chunk+1); } +void ChunkedMemoryObject::copyInAsync(unsigned int chunk, ExecutionFlow* executionFlow) { copy(true, true, chunk, chunk+1, executionFlow); } +void 
ChunkedMemoryObject::copyOutAsync(unsigned int chunk, ExecutionFlow* executionFlow) { copy(false, true, chunk, chunk+1, executionFlow); } + + +///// StreamElement ///// + +StreamElement::StreamElement(Device* device) : BaseStreamElement(device) { + // Can't call this virtual function in the base constructor + this->start(); +} + +StreamElement::~StreamElement() { } + + +///// KernelGenerator ///// + +const std::string KernelGenerator::KERNEL_PREFIX = "__kernel"; +const std::string KernelGenerator::GLOBAL_MEMORY_PREFIX = "__global"; +const std::string KernelGenerator::SHARED_MEMORY_PREFIX = "__local"; +const std::string KernelGenerator::CONSTANT_PREFIX = "__constant"; +const std::string KernelGenerator::DEVICE_FUNCTION_PREFIX = ""; + +const std::string KernelGenerator::getKernelPrefix() { + return KernelGenerator::KERNEL_PREFIX + " void"; +} +std::string KernelGenerator::generateStdFunctions() { + return "" + "size_t gspar_get_global_id(unsigned int dimension) { return get_global_id(dimension); } \n" + "size_t gspar_get_thread_id(unsigned int dimension) { return get_local_id(dimension); } \n" + "size_t gspar_get_block_id(unsigned int dimension) { return get_group_id(dimension); } \n" + "size_t gspar_get_block_size(unsigned int dimension) { return get_local_size(dimension); } \n" + "size_t gspar_get_grid_size(unsigned int dimension) { return get_num_groups(dimension); } \n" + "void gspar_synchronize_local_threads() { barrier(CLK_LOCAL_MEM_FENCE); } \n" + "int gspar_atomic_add_int(__global int *valq, int delta){ atomic_add(valq, delta); } \n" + "double gspar_atomic_add_double(__global double *valq, double delta){ \n " + " union { double f; unsigned long i; } old; \n" + " union { double f; unsigned long i; } new1; \n" + " do { \n" + " old.f = *valq; \n" + " new1.f = old.f + delta; \n" + " } while (atom_cmpxchg((volatile __global unsigned long *)valq, old.i, new1.i) != old.i); \n" + " return old.f; \n" + "} \n" + ; +} +std::string KernelGenerator::replaceMacroKeywords(std::string kernelSource) { + kernelSource = std::regex_replace(kernelSource, std::regex("GSPAR_DEVICE_MACRO_BEGIN"), "#define"); + kernelSource = std::regex_replace(kernelSource, std::regex("GSPAR_DEVICE_MACRO_END"), "\n"); + return kernelSource; +} +std::string KernelGenerator::generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions max) { + return ""; +} +std::string KernelGenerator::generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::string r = ""; + for(int d = 0; d < dims.getCount(); d++) { + if (dims.is(d)) { + std::string varName = this->getStdVarNameForDimension(pattern->getStdVarNames(), d); + r += "const unsigned long gspar_max_" + varName + ","; + if (dims[d].min && !pattern->isBatched()) { + // TODO Support min in batches + r += "const unsigned long gspar_min_" + varName + ","; + } + } + } + if (pattern->isBatched()) { + // This names are used in other methods + r += "unsigned int gspar_batch_size,"; + } + for(auto ¶m : pattern->getParameterList()) { + if (param->direction != Pattern::ParameterDirection::GSPAR_PARAM_NONE) { + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER || param->isBatched()) { // Batched values are always pointers + r += KernelGenerator::GLOBAL_MEMORY_PREFIX + " "; + } + if (param->direction == Pattern::ParameterDirection::GSPAR_PARAM_IN && param->isConstant()) { + r += "const "; + } + r += param->toKernelParameter() + ","; + } + } + if (pattern->isUsingSharedMemory()) { + auto shmem = pattern->getSharedMemoryParameter(); + r 
+= KernelGenerator::SHARED_MEMORY_PREFIX + " " + shmem->toString(); + } else { + if (!r.empty()) r.pop_back(); // removes last comma + } + return r; +} +std::string KernelGenerator::generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::array patternNames = pattern->getStdVarNames(); + + // OpenCL get_global_id returns a size_t, so this is the type of our std variables + // https://www.khronos.org/registry/OpenCL/specs/opencl-1.2.pdf#page=244 + std::string r; + for(int d = 0; d < dims.getCount(); d++) { + if (dims.is(d)) { + std::string varName = this->getStdVarNameForDimension(patternNames, d); + if (pattern->isBatched()) { + r += "size_t gspar_global_" + varName; + } else { + r += "size_t " + varName; + } + r += " = gspar_get_global_id(" + std::to_string(d) + ")"; + if (dims[d].min && !pattern->isBatched()) { + // TODO Support min in batches + r += " + gspar_min_" + varName; + } + r += "; \n"; + // TODO Support multi-dimensional batches + if (pattern->isBatched()) { + // Intended implicit floor(gspar_global/dims) + r += "size_t gspar_batch_" + varName + " = ((size_t)(gspar_global_" + varName + " / " + std::to_string(dims[d].max) + ")); \n"; + r += "size_t gspar_offset_" + varName + " = gspar_batch_" + varName + " * " + std::to_string(dims[d].max) + "; \n"; + // This variable names are used in other methods, keep track + r += "size_t " + varName + " = gspar_global_" + varName + " - gspar_offset_" + varName + "; \n"; + } + } + } + return r; +} +std::string KernelGenerator::generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions dims) { + std::array patternNames = pattern->getStdVarNames(); + // TODO Support multi-dimensional batches + std::string stdVarFirstDimension = this->getStdVarNameForDimension(patternNames, 0); + + std::string r = ""; + for(auto ¶m : pattern->getParameterList()) { + if (param->isBatched()) { + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + r += "__global "; + } + r += param->type.getFullName() + " " + param->name + " = "; + if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_POINTER) { + r += "&" + param->getKernelParameterName() + "[gspar_offset_" + stdVarFirstDimension + "]"; + } else if (param->paramValueType == Pattern::ParameterValueType::GSPAR_PARAM_VALUE) { + r += param->getKernelParameterName() + "[gspar_batch_" + stdVarFirstDimension + "]"; + } + r += ";\n"; + } + } + return r; +} diff --git a/src/GSPar_OpenCL.hpp b/src/GSPar_OpenCL.hpp new file mode 100644 index 0000000..a0327c4 --- /dev/null +++ b/src/GSPar_OpenCL.hpp @@ -0,0 +1,260 @@ + +#ifndef __GSPAR_OPENCL_INCLUDED__ +#define __GSPAR_OPENCL_INCLUDED__ + +#include +#include +#include +#include + +///// Forward declarations ///// + +namespace GSPar { + namespace Driver { + namespace OpenCL { + class Exception; + class ExecutionFlow; + class AsyncExecutionSupport; + class Instance; + class Device; + class Kernel; + class MemoryObject; + class ChunkedMemoryObject; + class StreamElement; + class KernelGenerator; + } + } +} + +#include "GSPar_BaseGPUDriver.hpp" + +namespace GSPar { + namespace Driver { + namespace OpenCL { + + ///// Exception ///// + + class Exception : + public BaseException { + protected: + std::string getErrorString(cl_int code) override; + + public: + explicit Exception(std::string msg, std::string details = ""); + explicit Exception(cl_int code, std::string details = ""); + + static Exception* checkError(cl_int code, std::string details = ""); + static void 
throwIfFailed(cl_int code, std::string details = ""); + + explicit Exception(cl_int code, cl_program program, cl_device_id device); + static Exception* checkError(cl_int code, cl_program program, cl_device_id device); + static void throwIfFailed(cl_int code, cl_program program, cl_device_id device); + }; + + ///// ExecutionFlow ///// + + class ExecutionFlow : + virtual public BaseExecutionFlow { + public: + ExecutionFlow(); + explicit ExecutionFlow(Device* device); + virtual ~ExecutionFlow(); + cl_command_queue start() override; + void synchronize() override; + + static cl_command_queue checkAndStartFlow(Device* device, ExecutionFlow* executionFlow = NULL); + }; + + ///// AsyncExecutionSupport ///// + + class AsyncExecutionSupport : + virtual public BaseAsyncExecutionSupport { + protected: + unsigned int numAsyncEvents = 0; + /// OpenCL sometimes simply hangs on clWaitForEvents + /// I've seen it happen when using multithread and 3 kernels (pattern->run) called sequentially by each thread + /// The internet are full of people complaining over similar issues, and one of them used clFinish instead of clWaitForEvents, so that's what we're gonna do + /// https://github.com/fangq/mcxcl/commit/135dc825e2905253ab0626a2b335dfee8b6e741e + /// https://community.intel.com/t5/OpenCL/Is-there-a-driver-watchdog-time-limit-for-Intel-GPU-on-Linux/td-p/1108291 + /// Whenever an Execution Flow is filled here, we'll synchronize it instead of waiting for the event + ExecutionFlow *executionFlow = nullptr; + public: + AsyncExecutionSupport(cl_event *asyncObjs = NULL, unsigned int numAsyncEvents = 0); + virtual ~AsyncExecutionSupport(); + void setBaseAsyncObject(cl_event *asyncObject) override; + void waitAsync() override; + + void releaseBaseAsyncObject(); + void setBaseAsyncObject(cl_event *asyncObject, unsigned int numAsyncEvents); + void setExecutionFlowToSynchronize(ExecutionFlow *flow) { + this->executionFlow = flow; + } + static void waitAllAsync(std::initializer_list asyncs); + }; + + ///// Instance ///// + + class Instance : public BaseInstance { + protected: + static Instance *instance; + void loadGpuList() override; + + public: + Instance(); + virtual ~Instance(); + void init() override; + unsigned int getGpuCount() override; + + static Instance* getInstance(); + }; + + ///// Device ///// + + class Device : + public BaseDevice { + private: + mutable std::mutex attributeCacheMutex; + std::map attributeCache; + + public: + Device(); + explicit Device(cl_device_id deviceId); + virtual ~Device(); + ExecutionFlow* getDefaultExecutionFlow() override; + cl_context getContext() override; + cl_command_queue startDefaultExecutionFlow() override; + const std::string getName() override; + unsigned int getComputeUnitsCount() override; + unsigned int getWarpSize() override; + unsigned int getMaxThreadsPerBlock() override; + unsigned long getGlobalMemorySizeBytes() override; + unsigned long getLocalMemorySizeBytes() override; + unsigned long getSharedMemoryPerComputeUnitSizeBytes() override; + unsigned int getClockRateMHz() override; + bool isIntegratedMainMemory() override; + MemoryObject* malloc(long size, void* hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) override; + MemoryObject* malloc(long size, const void* hostPtr = nullptr) override; + ChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, void** hostPtr = nullptr, bool readOnly = false, bool writeOnly = false) override; + ChunkedMemoryObject* mallocChunked(unsigned int chunks, long chunkSize, const void** hostPtr = 
nullptr) override; + Kernel* prepareKernel(const std::string kernelSource, const std::string kernelName) override; + std::vector prepareKernels(const std::string kernelSource, const std::vector kernelNames) override; + + template + const T* queryInfoDevice(cl_device_info paramName, bool cacheable = true); + cl_program compileOCLProgram(std::string source); + }; + + ///// Kernel ///// + + class Kernel : + public BaseKernel, + public AsyncExecutionSupport { + private: + cl_program oclProgram; + cl_kernel oclKernel; + bool isPrecompiled; + std::map attributeCache; + + void loadOclKernel(const std::string kernelName); + + public: + Kernel(); + Kernel(Device* device, const std::string kernelSource, const std::string kernelName); + virtual ~Kernel(); + virtual void cloneInto(BaseKernelBase* baseOther) override; + int setParameter(MemoryObject* memoryObject) override; + int setParameter(ChunkedMemoryObject* chunkedMemoryObject) override; + int setParameter(size_t parmSize, void* parm) override; + int setParameter(size_t parmSize, const void* parm) override; + Dimensions getNumBlocksAndThreadsFor(Dimensions dims) override; + void runAsync(Dimensions max, ExecutionFlow* executionFlow = NULL) override; + + template + T* queryInfo(cl_kernel_work_group_info param, bool cacheable = true); + Kernel(Device* device, cl_program oclProgram, const std::string kernelName); + }; + + ///// MemoryObject ///// + + class MemoryObject : + public BaseMemoryObject, + public AsyncExecutionSupport { + private: + void copy(bool in, bool async, ExecutionFlow* executionFlow = NULL); + void allocDeviceMemory(); + + public: + MemoryObject(Device* device, size_t size, void* hostPtr, bool readOnly, bool writeOnly); + MemoryObject(Device* device, size_t size, const void* hostPtr); + virtual ~MemoryObject(); + void copyIn() override; + void copyOut() override; + void copyInAsync(ExecutionFlow* executionFlow = NULL) override; + void copyOutAsync(ExecutionFlow* executionFlow = NULL) override; + }; + + ///// ChunkedMemoryObject ///// + + class ChunkedMemoryObject : + public BaseChunkedMemoryObject, + public AsyncExecutionSupport { + private: + void copy(bool in, bool async, unsigned int chunkFrom, unsigned int chunkTo, ExecutionFlow* executionFlow = NULL); + void allocDeviceMemory(); + + public: + ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, void** hostPointers, bool readOnly, bool writeOnly); + ChunkedMemoryObject(Device* device, unsigned int chunks, size_t chunkSize, const void** hostPointers); + virtual ~ChunkedMemoryObject(); + // Copy all chunks + virtual void copyIn() override; + virtual void copyOut() override; + virtual void copyInAsync(ExecutionFlow* executionFlow = NULL) override; + virtual void copyOutAsync(ExecutionFlow* executionFlow = NULL) override; + // Copy specific chunks of memory. We can't use function overloading due to the override. 
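+ // Usage sketch (hypothetical host code, variable names assumed): a single-chunk copy enqueues one
+ // clEnqueueWriteBuffer/clEnqueueReadBuffer at byte offset chunk * getChunkSize(), e.g.:
+ //   ChunkedMemoryObject* obj = device->mallocChunked(8, chunkSize, hostPointers);
+ //   obj->copyInAsync(3, flow); // uploads only chunk 3 on the given execution flow
+ //   obj->waitAsync();          // synchronizes through the flow (see AsyncExecutionSupport)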
+ virtual void copyIn(unsigned int chunk); + virtual void copyOut(unsigned int chunk); + virtual void copyInAsync(unsigned int chunk, ExecutionFlow* executionFlow = NULL); + virtual void copyOutAsync(unsigned int chunk, ExecutionFlow* executionFlow = NULL); + }; + + ///// StreamElement ///// + + class StreamElement : + public BaseStreamElement, + public AsyncExecutionSupport, + public ExecutionFlow { + private: + Kernel* kernel; + cl_kernel oclKernel = NULL; + + public: + explicit StreamElement(Device* device); + ~StreamElement(); + }; + + ///// KernelGenerator ///// + + class KernelGenerator : + public BaseKernelGenerator { + public: + static const std::string KERNEL_PREFIX; + static const std::string GLOBAL_MEMORY_PREFIX; + static const std::string SHARED_MEMORY_PREFIX; + static const std::string CONSTANT_PREFIX; + static const std::string DEVICE_FUNCTION_PREFIX; + const std::string getKernelPrefix() override; + std::string generateStdFunctions() override; + std::string replaceMacroKeywords(std::string kernelSource) override; + std::string generateInitKernel(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateParams(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateStdVariables(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + std::string generateBatchedParametersInitialization(Pattern::BaseParallelPattern* pattern, Dimensions dims) override; + + }; + + } + } +} + +#endif diff --git a/src/GSPar_PatternComposition.hpp b/src/GSPar_PatternComposition.hpp new file mode 100644 index 0000000..6952e46 --- /dev/null +++ b/src/GSPar_PatternComposition.hpp @@ -0,0 +1,271 @@ + +#ifndef __GSPAR_PATTERNCOMPOSITION_INCLUDED__ +#define __GSPAR_PATTERNCOMPOSITION_INCLUDED__ + +#include +#include +#include +#include + +///// Forward declarations ///// + +namespace GSPar { + namespace Pattern { + class PatternComposition; + } +} + +#include "GSPar_Base.hpp" +#include "GSPar_BaseGPUDriver.hpp" +#include "GSPar_BaseParallelPattern.hpp" +#include "GSPar_PatternMap.hpp" +#include "GSPar_PatternReduce.hpp" + +namespace GSPar { + namespace Pattern { + + enum PatternType { + GSPAR_PATTERN_MAP, + GSPAR_PATTERN_REDUCE + }; + + class PatternComposition { + protected: + bool built = false; + std::string extraKernelCode; + std::array stdVarNames; + std::vector patterns; + std::map patternsTypes; + Driver::Dimensions compiledPatternsDimension; + + template + inline bool instanceof(const T*) { + return std::is_base_of::value; + } + + template + std::string generateKernelSource(Driver::Dimensions max, unsigned int gpuIndex = 0) { + + std::string kernelSource = this->extraKernelCode; + if (!this->extraKernelCode.empty()) { + kernelSource += "\n"; + } + bool addedKernel = false; + for(auto pattern : patterns) { + if (pattern->getGpuIndex() != gpuIndex) { + continue; + } + addedKernel = true; + + pattern->callbackBeforeGeneratingKernelSource(); + kernelSource += pattern->generateKernelSource(max); + kernelSource += "\n"; + } + + return addedKernel ? kernelSource : ""; + } + + template + PatternComposition& addPatternInverseOrder(T* pattern) { + this->assertValidParallelPattern(pattern); + //This has a terrible performance, but this vector shouldn't be that large for this to be a problem + patterns.insert(patterns.begin(), 1, pattern); + this->patternsTypes[pattern] = this->instanceof(pattern) ? 
GSPAR_PATTERN_MAP : GSPAR_PATTERN_REDUCE; + return *this; + } + + void assertAnyPatternAdded() { + if (this->patterns.empty()) { + throw GSParException("No patterns added in composition, interrupting"); + } + } + template + void assertValidParallelPattern(T* pattern) { + if (!this->instanceof(pattern)) { + throw GSParException("Trying to add invalid pattern. All patterns must inherit BaseParallelPattern."); + } + } + + template + void run(Driver::Dimensions pDims, bool useCompiledDim) { + this->assertAnyPatternAdded(); + Driver::Dimensions dims = useCompiledDim ? this->compiledPatternsDimension : pDims; + if (!dims.getCount()) { + throw GSParException("No dimensions set to run the pattern composition"); + } + + // TODO validade if dims is valid + + this->compilePatterns(dims); + + for (const auto& pattern : this->patterns) { + // We pass dims again in Run case we have other thread asking the pattern to compile to another dims (which shouldn't happen anyway) + switch (this->patternsTypes[pattern]) { + case GSPAR_PATTERN_MAP: + (static_cast(pattern))->run(dims); + break; + case GSPAR_PATTERN_REDUCE: + // Almost https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern + (static_cast(pattern))->run(dims); + break; + } + } + } + + public: + PatternComposition() = default; + + template + PatternComposition(std::initializer_list patterns) { + for (auto p : patterns) { + this->addPattern(p); + } + } + + template + PatternComposition(TFirst pattern, TArgs... args) : PatternComposition(args...) { + this->addPatternInverseOrder(pattern); // The elements are processed from last to first + } + + virtual ~PatternComposition() { } + + + template + PatternComposition* clone() const { + PatternComposition* other = new PatternComposition(); + for (const auto &pattern : this->patterns) { + switch (this->patternsTypes.at(pattern)) { + case GSPAR_PATTERN_MAP: + other->addPattern((static_cast(pattern))->clone()); + break; + case GSPAR_PATTERN_REDUCE: + other->addPattern((static_cast(pattern))->clone()); + break; + } + } + other->built = this->built; + other->extraKernelCode = this->extraKernelCode; + other->stdVarNames = this->stdVarNames; + if (this->compiledPatternsDimension.getCount()) { + Driver::Dimensions compiledPatternsDimension = this->compiledPatternsDimension; + other->compiledPatternsDimension = compiledPatternsDimension; + } + return other; + } + + virtual PatternComposition& addExtraKernelCode(std::string extraKernelCode) { + this->extraKernelCode += extraKernelCode; + return *this; + } + + virtual BaseParallelPattern* getPattern(size_t index) { + return patterns[index]; + } + + template + PatternComposition& addPattern(T* pattern) { + this->assertValidParallelPattern(pattern); + patterns.push_back(pattern); + this->patternsTypes[pattern] = this->instanceof(pattern) ? 
GSPAR_PATTERN_MAP : GSPAR_PATTERN_REDUCE; + return *this; + } + + virtual bool isAllPatternsCompiledFor(Driver::Dimensions dims) { + if (this->compiledPatternsDimension != dims) { // We are compiled with a different dims + return false; + } + for (auto pattern : this->patterns) { + if (!pattern->isKernelCompiledFor(dims)) { + return false; + } + } + return true; + } + + template + PatternComposition& compilePatterns(Driver::Dimensions dims) { + this->assertAnyPatternAdded(); + if (this->isAllPatternsCompiledFor(dims)) { + // The kernels are already compiled + return *this; + } + + // Init GPU driver + TDriverInstance* driver = TDriverInstance::getInstance(); + // Driver::OpenCL::Instance driver = TDriverInstance::getInstance(); //Provides autocomplete + driver->init(); + + if (driver->getGpuCount() == 0) { + throw GSParException("No GPU found, interrupting"); + } + + auto gpus = driver->getGpuList(); + + unsigned int gpuIndex = 0; + for (const auto& gpu : gpus) { + // Prepare kernels + std::string kernelSource = this->generateKernelSource(dims, gpuIndex); + if (kernelSource.empty()) { + continue; // If there's no patterns in this GPU, we can move on + } + + std::vector kernelNames; + for (auto pattern : this->patterns) { + if (pattern->getGpuIndex() != gpuIndex) { + continue; + } + kernelNames.push_back(pattern->getKernelName()); + } + + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[GSPar "<prepareKernels(kernelSource.c_str(), kernelNames); + int patternIndex = 0; + for (auto pattern : this->patterns) { + if (pattern->getGpuIndex() != gpuIndex) { + continue; + } + pattern->setCompiledKernel(kernels.at(patternIndex), dims); + patternIndex++; + } + } + gpuIndex++; + } + + this->compiledPatternsDimension = dims; + + return *this; + } + + template + void run() { + this->run(Driver::Dimensions(), true); + } + + template + void run(unsigned long dims[3][2]) { + this->run(Driver::Dimensions(dims), false); + } + + template + void run(unsigned long max[3]) { + this->run(Driver::Dimensions(max), false); + } + + template + void run(Driver::Dimensions dims) { + this->run(dims, false); + } + }; + + } +} + +#endif diff --git a/src/GSPar_PatternMap.hpp b/src/GSPar_PatternMap.hpp new file mode 100644 index 0000000..b2a1dc6 --- /dev/null +++ b/src/GSPar_PatternMap.hpp @@ -0,0 +1,29 @@ + +#ifndef __GSPAR_PATTERNMAP_INCLUDED__ +#define __GSPAR_PATTERNMAP_INCLUDED__ + +#include "GSPar_BaseParallelPattern.hpp" + +namespace GSPar { + namespace Pattern { + + /** + * Map parallel pattern + */ + class Map : public BaseParallelPattern { + public: + Map() : BaseParallelPattern() { }; + Map(std::string source) : BaseParallelPattern(source) { }; + + template + Map* clone() const { + Map* other = new Map(); + this->cloneInto(other); + return other; + } + }; + + } +} + +#endif diff --git a/src/GSPar_PatternReduce.cpp b/src/GSPar_PatternReduce.cpp new file mode 100644 index 0000000..ab21161 --- /dev/null +++ b/src/GSPar_PatternReduce.cpp @@ -0,0 +1,155 @@ +#include + +#include "GSPar_PatternReduce.hpp" + +using namespace GSPar::Pattern; + +PointerParameter* Reduce::getOutputParameter() { + auto param = this->getParameter(this->outputParameterName); + if (!param) { + throw GSParException("Could not find output parameter with name '" + this->outputParameterName + "' in Reduce pattern"); + } + return static_cast(param); +} + +PointerParameter* Reduce::generateSharedMemoryParameter(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { + if (dims.y || dims.z) { + // TODO support multiple dimensions + throw 
GSParException("Reduce pattern currently does not support multi-dimensional kernels"); + } + + // if (this->sharedMemoryParameter == nullptr || !this->sharedMemoryParameter->isComplete()) { + this->getSharedMemoryParameter(); // Generate the placeholder parameter + + std::lock_guard lock(this->sharedMemoryParameterMutex); // Auto-unlock, RAII + if (!this->sharedMemoryParameter->isComplete()) { // Check if there was a race condition for this resource + Driver::Dimensions blocksAndThreads = kernel->getNumBlocksAndThreadsFor(dims); + size_t sharedMemSize = (dims.x.max > blocksAndThreads.x.max) ? blocksAndThreads.x.max : dims.x.max; + + auto outParam = this->getOutputParameter(); + this->sharedMemoryParameter->numberOfElements = sharedMemSize; + this->sharedMemoryParameter->size = outParam->size * sharedMemSize; + this->sharedMemoryParameter->setComplete(true); + } + // Auto-unlock of sharedMemoryParameterMutex, RAII + // } + return this->sharedMemoryParameter; +} + +PointerParameter* Reduce::getSharedMemoryParameter() { + if (this->sharedMemoryParameter == nullptr) { + std::lock_guard lock(this->sharedMemoryParameterMutex); // Auto-unlock, RAII + if (this->sharedMemoryParameter == nullptr) { // Check if there was a race condition for this resource + auto outParam = this->getOutputParameter(); + std::string paramName = "gspar_shared_" + getRandomString(5); + this->sharedMemoryParameter = new PointerParameter(paramName, outParam->type, 0, nullptr); + } + // Auto-unlock of sharedMemoryParameterMutex, RAII + } + return this->sharedMemoryParameter; +}; + +std::string Reduce::getKernelCore(Driver::Dimensions dims, std::array stdVarNames) { + if (dims.y || dims.z) { + // TODO support multiple dimensions + throw GSParException("Reduce pattern currently does not support multi-dimensional kernels"); + } + + PointerParameter *outParam = this->getOutputParameter(); + auto shmemParam = this->getSharedMemoryParameter(); + std::string shmem = shmemParam->name; + + std::string op = this->binaryOperation; + std::string gid = stdVarNames[0]; + std::string max = "gspar_max_" + stdVarNames[0]; + std::string tid = "gspar_tid_" + stdVarNames[0]; + std::string bid = "gspar_bid_" + stdVarNames[0]; + std::string bsize = "gspar_bsize_" + stdVarNames[0]; + + // TODO support batches and min-max in Reduce + + // https://devblogs.nvidia.com/using-shared-memory-cuda-cc/ + // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf + std::string kernelSource = + " size_t " + tid + " = gspar_get_thread_id(0); \n" + " size_t " + bid + " = gspar_get_block_id(0); \n" + " size_t " + bsize + " = gspar_get_block_size(0); \n" + " " + shmem + "["+tid+"] = " + this->vectorName + "["+gid+"]; \n" + " gspar_synchronize_local_threads(); \n" + + " for (unsigned int s="+bsize+"/2; s>0; s>>=1) { \n" + " if ("+tid+" < s && "+gid+"+s < "+max+") { \n" + " "+shmem+"["+tid+"] = "+shmem+"["+tid+"]" + op + shmem+"["+tid+"+s]; \n" + " } \n" + " gspar_synchronize_local_threads(); \n" + " if ("+tid+" == 0 && s > 1 && s % 2 != 0) { \n" + " "+shmem+"["+tid+"] = "+shmem+"["+tid+"]" + op + shmem+"[s-1]; \n" + " } \n" + " gspar_synchronize_local_threads(); \n" + " } \n" + " if ("+tid+" == 0) { \n" + " if ("+bsize+" % 2 != 0) { \n" + " "+shmem+"[0] = "+shmem+"[0]" + op + shmem+"["+max+"-1]; \n" + " } \n" + " " + this->partialTotalsParamName + "["+bid+"] = "+shmem+"[0]; \n" + // If the param is input, we reduce it together in the end + + (outParam->isIn() ? 
+ " if (gspar_get_grid_size(0) == 1) { \n" + " " + this->partialTotalsParamName+"["+bid+"] = " + this->partialTotalsParamName+"["+bid+"]" + op + "*" + outParam->name + "; \n" + " } \n" + : "") + + " } \n" + ; + + return kernelSource; +}; + +bool Reduce::isKernelCompiledFor(Driver::Dimensions dims) { + // We only compile if the kernel wasn't compiled yet and the configuration didn't change + return this->_isKernelCompiled && !this->isKernelStale && this->compiledKernelDimension.getCount() == dims.getCount(); +} + +void Reduce::callbackBeforeGeneratingKernelSource() { + auto partialTotalsParam = this->getParameter(this->partialTotalsParamName); + if (!partialTotalsParam) { + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[GSPar Reduce "<partialTotalsParamName << ")" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + // It is a ParameterPlaceholder, but we don't have the type here to call the proper function + auto outParam = this->getOutputParameter(); + VarType partialsTotalsType = outParam->type; + if (!partialsTotalsType.isPointer) { + partialsTotalsType.name += "*"; + partialsTotalsType.isPointer = true; + } + this->setPointerParameter(this->partialTotalsParamName, partialsTotalsType, 0, nullptr, GSPAR_PARAM_OUT); + } +} + +void Reduce::callbackBeforeAllocatingMemoryOnGpu(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) { + auto partialTotalsParam = this->getParameter(this->partialTotalsParamName); + if (!partialTotalsParam || !partialTotalsParam->isComplete()) { + // TODO we could use the previous value (~15 lines above) + Driver::Dimensions blocksAndThreads = kernel->getNumBlocksAndThreadsFor(dims); + auto outParam = this->getOutputParameter(); + + size_t partialTotalsSize = blocksAndThreads.x.min * outParam->size; // Number of blocks * data size + // Should we store this pointer in a class-wide attribute? 
+ void *partialTotals = malloc(partialTotalsSize); + #ifdef GSPAR_DEBUG + std::stringstream ss; + ss << "[GSPar Reduce "<partialTotalsParamName << ") as " << partialTotals << " (pointer of " << partialTotalsSize << " bytes)" << std::endl; + std::cout << ss.str(); + ss.str(""); + #endif + VarType partialsTotalsType = outParam->type; + if (!partialsTotalsType.isPointer) { + partialsTotalsType.name += "*"; + partialsTotalsType.isPointer = true; + } + this->setPointerParameter(this->partialTotalsParamName, partialsTotalsType, partialTotalsSize, partialTotals, GSPAR_PARAM_OUT); + } +} diff --git a/src/GSPar_PatternReduce.hpp b/src/GSPar_PatternReduce.hpp new file mode 100644 index 0000000..cc9f074 --- /dev/null +++ b/src/GSPar_PatternReduce.hpp @@ -0,0 +1,174 @@ + +#ifndef __GSPAR_PATTERNREDUCE_INCLUDED__ +#define __GSPAR_PATTERNREDUCE_INCLUDED__ + +#include "GSPar_BaseParallelPattern.hpp" + +namespace GSPar { + namespace Pattern { + + /** + * Reduce parallel pattern + */ + class Reduce : public BaseParallelPattern { + private: + const std::string partialTotalsParamName = "gspar_partial_reductions"; + PointerParameter* getOutputParameter(); + + protected: + std::string vectorName; + std::string binaryOperation; // https://northstar-www.dartmouth.edu/doc/ibmcxx/en_US/doc/language/ref/ruclxbin.htm + std::string outputParameterName; + + PointerParameter* generateSharedMemoryParameter(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) override; + PointerParameter* getSharedMemoryParameter() override; + + public: + Reduce() : BaseParallelPattern() { }; + Reduce(std::string vectorName, std::string binaryOperation, std::string outputParameterName) : BaseParallelPattern("") { + this->vectorName = vectorName; + this->binaryOperation = binaryOperation; + this->outputParameterName = outputParameterName; + this->useSharedMemory = true; + }; + + template + Reduce* clone() const { + Reduce* other = new Reduce(); + this->cloneInto(other); + other->vectorName = this->vectorName; + other->binaryOperation = this->binaryOperation; + other->outputParameterName = this->outputParameterName; + return other; + }; + + std::string getKernelCore(Driver::Dimensions dims, std::array stdVarNames) override; + + bool isKernelCompiledFor(Driver::Dimensions dims) override; + + // Callback override + void callbackBeforeGeneratingKernelSource() override; + void callbackBeforeAllocatingMemoryOnGpu(Driver::Dimensions dims, Driver::BaseKernelBase *kernel) override; + + // Main run function for Reduce Pattern + // TODO this does not override base class due to templates. Fix this. 
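+ // Usage sketch (hypothetical host code; registering the "v" and "result" parameters uses the
+ // BaseParallelPattern API declared elsewhere):
+ //   Reduce* sum = new Reduce("v", "+", "result"); // reduce vector "v" with '+' into "result"
+ //   unsigned long max[3] = { n, 0, 0 };
+ //   sum->run<GSPar::Driver::OpenCL::Instance>(GSPar::Driver::Dimensions(max));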
+ template + void run(Driver::Dimensions dimsToUse) { + if (dimsToUse.y || dimsToUse.z) { + // TODO support multiple dimensions + throw GSParException("Reduce pattern currently does not support multi-dimensional kernels"); + } + + // TODO support batched Reduce pattern + + #ifdef GSPAR_DEBUG + std::stringstream ss; + #endif + this->compile(dimsToUse); + + // #ifdef GSPAR_DEBUG + // auto gpu = this->getGpu(); + // ss << "[GSPar Reduce "<getName() << std::endl; + // std::cout << ss.str(); + // ss.str(""); + // #endif + + auto kernel = this->getCompiledKernel(); + kernel->clearParameters(); + + this->callbackBeforeAllocatingMemoryOnGpu(dimsToUse, kernel); + + this->mallocParametersInGpu(); + + this->copyParametersFromHostToGpuAsync(); + + auto executionFlow = this->getExecutionFlow(); + + Driver::Dimensions dimsToRun = dimsToUse; + + // We start reducing the input vector + PointerParameter *inputVector = static_cast(this->getParameter(this->vectorName)); + if (inputVector == nullptr) { + throw GSParException("Could not find input parameter with name '" + this->vectorName + "' in Reduce pattern"); + } + decltype(TDriverInstance::getMemoryObjectType())* inputMemoryObject = dynamic_cast(inputVector->getMemoryObject()); + + // In the first iteration, partialTotals is the output. After the first iteration, it is the input and output parameters + PointerParameter *partialTotals = static_cast(this->getParameter(this->partialTotalsParamName)); + if (partialTotals == nullptr) { + throw GSParException("Could not find partial totals parameter with name '" + this->partialTotalsParamName + "' in Reduce pattern"); + } + + while (true) { + + Driver::Dimensions blocksAndThreads = kernel->getNumBlocksAndThreadsFor(dimsToRun); + + this->setSharedMemoryInKernel(kernel, dimsToRun); + + // Init this->setParametersInKernel + this->setDimsParametersInKernel(kernel, dimsToRun); + + // Sets Pattern parameters in Kernel object + for (auto& paramName : this->paramsOrder) { + if (paramName == this->vectorName) { // Input parameter + if (inputMemoryObject) { + inputMemoryObject->waitAsync(); // Waits for async copy to finish + } + kernel->setParameter(inputMemoryObject); // We can simply set the memory object + } else { + auto param = this->getParameter(paramName); + this->setParameterInKernel(kernel, param); + } + + } + // Finish this->setParametersInKernel + + this->callbackAfterCopyDataFromHostToGpu(); + this->callbackBeforeRunInGpu(); + + #ifdef GSPAR_DEBUG + ss << "[GSPar Reduce "<runAsync(dimsToRun, executionFlow); + + kernel->waitAsync(); + + #ifdef GSPAR_DEBUG + ss << "[GSPar Reduce "<(partialTotals->getMemoryObject()); + + kernel->clearParameters(); + } + + // "Hack" to copy partial totals into output parameter + PointerParameter *outParam = this->getOutputParameter(); + decltype(TDriverInstance::getMemoryObjectType())* outputMemoryObject = dynamic_cast(partialTotals->getMemoryObject()); + outputMemoryObject->bindTo(outParam->getPointer(), outParam->size); + outputMemoryObject->copyOut(); + outParam->direction = GSPAR_PARAM_NONE; // We already copied the parameter out, copyParametersFromGpuToHostAsync should ignore it + + this->callbackAfterRunInGpu(); + + this->copyParametersFromGpuToHostAsync(); + + this->callbackAfterCopyDataFromGpuToHost(dimsToUse, kernel); + } + }; + + } +} + +#endif diff --git a/thirdpt/marX2/marX2.c b/thirdpt/marX2/marX2.c new file mode 100644 index 0000000..4bc12ed --- /dev/null +++ b/thirdpt/marX2/marX2.c @@ -0,0 +1,434 @@ +/* 
*************************************************************************** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU General Public License. This exception does not however + * invalidate any other reasons why the executable file might be covered by + * the GNU General Public License. + * + **************************************************************************** + */ + +/* + Author: Marco Aldinucci. + email: aldinuc@di.unipi.it + marco@pisa.quadrics.com + date : 15/11/97 + + Modified: Massimo Coppola + email: coppola@di.unipi.it + date: 23/11/2001 + +*/ + +/* gcc -Wall -ansi -c -O2 -I/usr/X11R6/include marX2.c for Linux LAN */ +/* cc -fast -Xc -c -I/usr/openwin/include marX2.c for SUN or MEIKO */ +/* Specify the path where X11 include are ... */ + +/* ar -rv marX2.a marX2.o */ +/* ranlib marX2.a */ + + +#include "marX2.h" + +#ifndef PI +#define PI 3.1415926535 +#endif + +/*#define DEBUG(x) x;fflush(stdout);*/ +/* defined empty to remove any debug printf */ +#define DEBUG(x) + +void ShowTM (); +void ChooseColorMap(int); +void RainbowColorMap(int); + +static Display * display; +static Window window; + +#define SCREEN DefaultScreen(display) + +static GC gc; +static int fg; +static int bg; +static XImage * image; +static Colormap colormap; + + +static char *m_image_buffer=NULL; /* pointer to the image buffer area */ + +static int m_h,m_w; /* size of our window */ + +static int iScreen; /* integer id of the default screen of display */ +static int dDepth; /* display bit depth: only 8,12,24 bits are supported */ +static unsigned int sizeofPixel; /* rounded up pixel size */ +static int rounded_length; /* rounded up length of a line in pixels */ + + +static int primo=0; /* ?? 0 if the colormap has not yet been changed ?? */ + +/* function that opens a window over the default screen */ +void SetupXWindows(int w, int h, int setup_color, char *display_name, + const char *window_title) +{ + XEvent event; + XGCValues gcvalues; + int mask; + + display = XOpenDisplay((display_name ? 
display_name : "")); + if (!display) + { + printf("Error in open X display\n"); + exit (1); + } + + iScreen = DefaultScreen(display); + dDepth = DefaultDepth(display,iScreen); + switch (dDepth) + { + case 8: + sizeofPixel=1; + break; + case 16: + sizeofPixel=2; + break; + case 24: + case 32: /* It should work; on my system 24 bits pixels are word-aligned */ + sizeofPixel=4; + break; + default: + printf("SetupXWindows : unsupported Display depth %d\n",dDepth); + } + + DEBUG(printf("Default display depth %d\n", dDepth)); + + m_w=w; + m_h=h; + + /* scanlines are rounded to 32 pixels to avoid any misalignment */ + rounded_length = (m_w+31)&(~31); + /* allocate window buffer space */ + m_image_buffer = /*(XImage *)*/ + (char *) calloc(sizeof(char)*sizeofPixel,rounded_length*m_h); + + fg=Black(); + bg=White(); + + window = XCreateSimpleWindow(display, DefaultRootWindow(display), + 0,0,w,h,2,fg,bg); + + mask = ExposureMask | ButtonPressMask | KeyPressMask; + XSelectInput(display, window,mask); + XStoreName(display, window, window_title); + XMapWindow(display, window); + + for (;;) + { + XNextEvent(display, &event); + if (event.type == Expose) + break; + } + + gcvalues.foreground = fg; + gcvalues.background = bg; + mask = GCForeground | GCBackground; + gc = XCreateGC(display, window, mask, &gcvalues); + + DEBUG(printf("Window opened\n")); +} + + +void CloseXWindows() +{ + /* should close the window and free all resources */ + + XCloseDisplay(display); + +} + +/* service function to convert an array of bytes to an array of dDepth + depth pixels. +*/ +static void ConvertLine(unsigned char *line, int line_len, XImage * dest) +{ + int i, value, mult=0; + switch (dDepth) + { + case 32: + mult = 0x01010101; + break; + case 24: + mult = 0x000f0704; + break; + case 16: + mult = 0x00000101; + break; + case 8: + mult = 1; + break; + } + for (i=0;irounded_length) line_len=rounded_length; + if (position>m_h) return; + + /* alloc temp image area and XImage structure */ + DEBUG(printf("Showline - 2 pt %x len %d pos %d rlen %d \n", + line, line_len, position, rounded_length)); + image_buffer = (char *)(calloc(sizeof(char)*sizeofPixel, rounded_length)); + + DEBUG(printf("Showline - 3 tmpimage %x siz %x len %x \n", + image_buffer, sizeof(char)*sizeofPixel, rounded_length)); + + image_line = XCreateImage(display,DefaultVisual(display, SCREEN), + dDepth,ZPixmap,0, + (char *)image_buffer,rounded_length,1,32,0); + + DEBUG(printf("Showline - 4.1 image_line %x \n",image_line)); + + /* convert the input data into the image */ + ConvertLine(line, line_len, image_line); + + /* Put the line into the window */ + XPutImage(display,window,gc,image_line,0,0,0,position,line_len,1); + XFlush(display); + + /* Save the line into our backing store */ + memcpy((m_image_buffer+(position*rounded_length*sizeofPixel)), + image_buffer, + line_len*sizeofPixel); + /* destroy temporary image */ + XDestroyImage (image_line); + DEBUG(printf("Showline - 9\n")); +} + + +/* manage some events coming from the window: + * refresh (Expose) events, + * button press, + * key presses: q,Q,c,C,r,R,m,M + */ +void HXI(int *px, int *py,int *dim,int *done) +{ + XEvent event; + int something =0; + static int next=0; + const int clicks = 10; /* how many clicks before exiting anyway */ + + *dim=1; + *done=0; + + while ((XEventsQueued(display, QueuedAfterReading) > 0)|| somethingx; + *py=bpe->y; + button=bpe->button; + *dim = (button == Button1 ? 2 : + button == Button2 ? 
4 : 8); + something++; + } + break; + case Expose: + if (m_image_buffer!=NULL) + { + if (image == NULL) + { + image = + XCreateImage(display, + DefaultVisual(display,SCREEN), + dDepth,ZPixmap,0, + (char*) m_image_buffer, + rounded_length,m_h,32,0); + DEBUG(printf("image %x \n",image)); + } + XPutImage(display,window,gc,image,0,0,0,0,m_w,m_h); + XFlush(display); + } + break; + case KeyPress: + { + XKeyEvent * kpe = (XKeyEvent *) &event; + KeySym ks = XLookupKeysym(kpe, 0); + + switch (ks) + { + case 'q': + case 'Q': + *done=1; + something=clicks; + break; + case 'r': + case 'R': + RainbowColorMap(128); + break; + case 'c': + case 'C': + ChooseColorMap(next++); + break; + case 'M': + case 'm': + ShowTM (); + break; + default: + printf("Keys:\n\n"); + printf( + "q) Quit !\n" + "c) change colormap (8bits display)\n" + "c + r) rainbow colormap (8bits display)\n"); + fflush(stdout); + + } + } + break; + } + } +} + + +/* two functions to get the default fg/gb colours on the default screen*/ +int Black(void){return(BlackPixelOfScreen(DefaultScreenOfDisplay(display)));} +int White(void){return(WhitePixelOfScreen(DefaultScreenOfDisplay(display)));} + + +/* show a short message on the window */ +void ShowTM () +{ + /* should clear the window */ + + XGCValues gcvalues,tmp_val; + int mask; + char s1[]="Bacci Cantalupo Ravazzolo"; + char s2[]="Riaudo Pesciullesi"; + char s3[]="Aldinucci Coppola Torquati"; + + mask=GCForeground|GCFunction; + XGetGCValues(display,gc,mask,&tmp_val); + gcvalues.foreground=1; + gcvalues.function=GXcopy; + XChangeGC(display,gc,mask,&gcvalues); + XDrawString(display,window,gc,5,10,"QSW PISA are:",12); + XDrawString(display,window,gc,10,25,s1,strlen(s1)); + XDrawString(display,window,gc,10,40,s2,strlen(s2)); + XDrawString(display,window,gc,10,55,s3,strlen(s3)); + /* Restore Graphic Context */ + XChangeGC(display,gc,mask,&tmp_val); + +} + +/* the following two functions change the colormap for 8-bit + displays. They should check and do nothing on true color displays */ +void ChooseColorMap(int which) +{ + + Visual visual; + XColor color; + int i; + + if (dDepth!=8)return; /* only for 256 color display! */ + + if (primo==0) + { + visual = *DefaultVisual(display, SCREEN); + colormap= XCreateColormap(display, window, &visual, AllocAll); + } + + color.flags=DoRed | DoGreen | DoBlue; + for (i=0; i< 256; i++) + { + color.pixel=i; + color.red=0xffffL * ((long) (i+which) * 101 %256)/255L; + color.green=0xffffL * ((long) (i+which) * 151 %256)/255L; + color.blue=0xffffL * ((long) (i+which) * 171 %256)/255L; + + XStoreColor(display, colormap, &color); + } + + if (primo==0) + { + XInstallColormap(display,colormap); + XSetWindowColormap(display,window,colormap); + primo=1; + } +} + +void RainbowColorMap(int n) +{ + int i, j; + double d, e; + XColor color; + + if (dDepth!=8)return; /* only for 256 color display! */ + + if (primo!=0) + { + color.flags=DoRed | DoGreen | DoBlue; + for (i = 1; i < n - 1; i++) { + j = n - 1 - i; + d = (d = cos((double)((j - n * 0.16) * (PI / n)))) < 0.0 + ? 0.0 : d; + color.blue = d * n; + d = (d = cos((double)((j - n * 0.52) * (PI / n)))) < 0.0 + ? 0.0 : d; + color.green = d * n; + d = (d = cos((double)((j - n * .83) * (PI / n)))) < 0.0 + ? 0.0 : d; + e = (e = cos((double)(j * (PI / n)))) < 0.0 + ? 
0.0 : e; + color.red = d * n + e * (n / 2); + color.pixel=i; + XStoreColor(display, colormap, &color); + } + color.green=color.blue=color.red=color.pixel=i; + XStoreColor(display, colormap, &color); + color.green=color.blue=color.red=color.pixel=0; + XStoreColor(display, colormap, &color); + } +} diff --git a/thirdpt/marX2/marX2.h b/thirdpt/marX2/marX2.h new file mode 100644 index 0000000..990e815 --- /dev/null +++ b/thirdpt/marX2/marX2.h @@ -0,0 +1,29 @@ + +#ifndef MARCO_X +#define MARCO_X + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif +void CloseXWindows(); +void ShowLine(void *line,int line_len,int position); +void SetupXWindows(int w, int h, int setup_color, char *display_name, + const char *window_title); + +void HXI(int *px, int *py,int *dim,int *done); + +int Black(); +int White(); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdpt/rapidxml-1.13/license.txt b/thirdpt/rapidxml-1.13/license.txt new file mode 100644 index 0000000..1409831 --- /dev/null +++ b/thirdpt/rapidxml-1.13/license.txt @@ -0,0 +1,52 @@ +Use of this software is granted under one of the following two licenses, +to be chosen freely by the user. + +1. Boost Software License - Version 1.0 - August 17th, 2003 +=============================================================================== + +Copyright (c) 2006, 2007 Marcin Kalicinski + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +2. The MIT License +=============================================================================== + +Copyright (c) 2006, 2007 Marcin Kalicinski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. diff --git a/thirdpt/rapidxml-1.13/manual.html b/thirdpt/rapidxml-1.13/manual.html new file mode 100644 index 0000000..2c42270 --- /dev/null +++ b/thirdpt/rapidxml-1.13/manual.html @@ -0,0 +1,406 @@ +

RAPIDXML Manual

Version 1.13

Copyright (C) 2006, 2009 Marcin Kalicinski
See accompanying file license.txt for license information.

Table of Contents

1. What is RapidXml?
1.1 Dependencies And Compatibility
1.2 Character Types And Encodings
1.3 Error Handling
1.4 Memory Allocation
1.5 W3C Compliance
1.6 API Design
1.7 Reliability
1.8 Acknowledgements
2. Two Minute Tutorial
2.1 Parsing
2.2 Accessing The DOM Tree
2.3 Modifying The DOM Tree
2.4 Printing XML
3. Differences From Regular XML Parsers
3.1 Lifetime Of Source Text
3.2 Ownership Of Strings
3.3 Destructive Vs Non-Destructive Mode
4. Performance
4.1 Comparison With Other Parsers
5. Reference

1. What is RapidXml?

RapidXml is an attempt to create the fastest XML DOM parser possible, while retaining usability, portability and reasonable W3C compatibility. It is an in-situ parser written in C++, with parsing speed approaching that of the strlen() function executed on the same data.

The entire parser is contained in a single header file, so no building or linking is necessary. To use it, you just need to copy the rapidxml.hpp file to a convenient place (such as your project directory) and include it where needed. You may also want to use the printing functions contained in the header rapidxml_print.hpp.

1.1 Dependencies And Compatibility

RapidXml has no dependencies other than a very small subset of the standard C++ library (<cassert>, <cstdlib>, <new> and <exception>, unless exceptions are disabled). It should compile on any reasonably conformant compiler, and was tested on Visual C++ 2003, Visual C++ 2005, Visual C++ 2008, gcc 3, gcc 4, and Comeau 4.3.3. Care was taken that no warnings are produced on these compilers, even with the highest warning levels enabled.

1.2 Character Types And Encodings

RapidXml is character type agnostic, and can work both with narrow and wide characters. The current version does not fully support UTF-16 or UTF-32, so use of wide characters is somewhat limited. However, it should successfully parse wchar_t strings containing UTF-16 or UTF-32 if the endianness of the data matches that of the machine. UTF-8 is fully supported, including all numeric character references, which are expanded into appropriate UTF-8 byte sequences (unless you enable the parse_no_utf8 flag).

Note that RapidXml performs no decoding - strings returned by the name() and value() functions will contain text encoded using the same encoding as the source file. RapidXml understands and expands the following character references: &apos; &amp; &quot; &lt; &gt; &#...; Other character references are not expanded.

1.3 Error Handling

By default, RapidXml uses C++ exceptions to report errors. If this behaviour is undesirable, RAPIDXML_NO_EXCEPTIONS can be defined to suppress exception code. See parse_error class and parse_error_handler() function for more information.
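A minimal sketch of catching and reporting a parse error, assuming the XML lives in a writable, zero-terminated buffer xml_text (the surrounding function name is illustrative only):

#include <iostream>
#include "rapidxml.hpp"

void parse_or_report(char *xml_text)
{
    rapidxml::xml_document<> doc;
    try
    {
        doc.parse<0>(xml_text);                  // may throw rapidxml::parse_error
    }
    catch (rapidxml::parse_error &e)
    {
        // what() returns a human-readable message; where<Ch>() points into the source text
        std::cerr << "Parse error: " << e.what()
                  << " near: " << e.where<char>() << "\n";
    }
}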

1.4 Memory Allocation

RapidXml uses a special memory pool object to allocate nodes and attributes, because direct allocation using new operator would be far too slow. Underlying memory allocations performed by the pool can be customized by use of memory_pool::set_allocator() function. See class memory_pool for more information.

1.5 W3C Compliance

RapidXml is not a W3C compliant parser, primarily because it ignores DOCTYPE declarations. There are a number of other, minor incompatibilities as well. Still, it can successfully parse and produce complete trees of all valid XML files in the W3C conformance suite (over 1000 files specially designed to find flaws in XML processors). In destructive mode it performs whitespace normalization and character entity substitution for a small set of built-in entities.

1.6 API Design

The RapidXml API is minimalistic, to reduce code size as much as possible and to facilitate use in embedded environments. Additional convenience functions are provided in separate headers: rapidxml_utils.hpp and rapidxml_print.hpp. The contents of these headers are not an essential part of the library and are currently documented only with comments in the code.

1.7 Reliability

RapidXml is very robust and comes with a large harness of unit tests. Special care has been taken to ensure stability of the parser no matter what source text is thrown at it. One of the unit tests produces 100,000 randomly corrupted variants of XML document, which (when uncorrupted) contains all constructs recognized by RapidXml. RapidXml passes this test when it correctly recognizes that errors have been introduced, and does not crash or loop indefinitely.

Another unit test puts RapidXml head-to-head with another, well-established XML parser, and verifies that their outputs match across a wide variety of small and large documents.

Yet another test feeds RapidXml with over 1000 test files from the W3C compliance suite, and verifies that correct results are obtained. There are also additional tests that verify each API function separately and test that various parsing modes work as expected.

1.8 Acknowledgements

I would like to thank Arseny Kapoulkine for his work on pugixml, which was an inspiration for this project. Additional thanks go to Kristen Wegner for creating pugxml, from which pugixml was derived. Janusz Wohlfeil kindly ran RapidXml speed tests on hardware that I did not have access to, allowing me to expand performance comparison table.

2. Two Minute Tutorial

2.1 Parsing

The following code causes RapidXml to parse a zero-terminated string named text:
using namespace rapidxml;
xml_document<> doc;    // character type defaults to char
doc.parse<0>(text);    // 0 means default parse flags

The doc object is now the root of a DOM tree representing the parsed XML. Because the entire RapidXml interface is contained inside namespace rapidxml, users must either bring the contents of this namespace into scope or fully qualify all names. Class xml_document represents the root of the DOM hierarchy; by means of public inheritance, it is also an xml_node and a memory_pool. The template parameter of the xml_document::parse() function specifies parsing flags, with which you can fine-tune the behaviour of the parser. Note that the flags must be a compile-time constant.
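For example, a minimal sketch of combining two of the flags listed in the Reference section (text is again assumed to be a writable, zero-terminated buffer):

using namespace rapidxml;
xml_document<> doc;
// Flags are combined with bitwise OR and must form a compile-time constant
doc.parse<parse_declaration_node | parse_comment_nodes>(text);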

2.2 Accessing The DOM Tree

To access the DOM tree, use methods of xml_node and xml_attribute classes:
cout << "Name of my first node is: " << doc.first_node()->name() << "\n";
xml_node<> *node = doc.first_node("foobar");
cout << "Node foobar has value " << node->value() << "\n";
for (xml_attribute<> *attr = node->first_attribute();
     attr; attr = attr->next_attribute())
{
    cout << "Node foobar has attribute " << attr->name() << " ";
    cout << "with value " << attr->value() << "\n";
}


2.3 Modifying The DOM Tree

The DOM tree produced by the parser is fully modifiable: nodes and attributes can be added or removed, and their contents changed. The example below creates an HTML document whose sole contents is a link to the google.com website:
xml_document<> doc;
xml_node<> *node = doc.allocate_node(node_element, "a", "Google");
doc.append_node(node);
xml_attribute<> *attr = doc.allocate_attribute("href", "google.com");
node->append_attribute(attr);

One quirk is that nodes and attributes do not own the text of their names and values. This is because normally they only store pointers to the source text. So, when assigning a new name or value to the node, care must be taken to ensure proper lifetime of the string. The easiest way to achieve it is to allocate the string from the xml_document memory pool. In the above example this is not necessary, because we are only assigning character constants. But the code below uses memory_pool::allocate_string() function to allocate node name (which will have the same lifetime as the document), and assigns it to a new node:
xml_document<> doc;
char *node_name = doc.allocate_string(name);        // Allocate string and copy name into it
xml_node<> *node = doc.allocate_node(node_element, node_name);  // Set node name to node_name

Check Reference section for description of the entire interface.

2.4 Printing XML

You can print xml_document and xml_node objects into an XML string. Use print() function or operator <<, which are defined in rapidxml_print.hpp header.
using namespace rapidxml;
xml_document<> doc;    // character type defaults to char
// ... some code to fill the document

// Print to stream using operator <<
std::cout << doc;

// Print to stream using print function, specifying printing flags
print(std::cout, doc, 0);   // 0 means default printing flags

// Print to string using output iterator
std::string s;
print(std::back_inserter(s), doc, 0);

// Print to memory buffer using output iterator
char buffer[4096];                      // You are responsible for making the buffer large enough!
char *end = print(buffer, doc, 0);      // end contains pointer to character after last printed character
*end = 0;                               // Add string terminator after XML


3. Differences From Regular XML Parsers

RapidXml is an in-situ parser, which allows it to achieve very high parsing speed. In-situ means that parser does not make copies of strings. Instead, it places pointers to the source text in the DOM hierarchy.

3.1 Lifetime Of Source Text

In-situ parsing requires that the source text lives at least as long as the document object. If the source text is destroyed, the names and values of nodes in the DOM tree become invalid as well. Additionally, whitespace processing, character entity translation, and zero-termination of strings require that the source text be modified during parsing (but see non-destructive mode). This makes the text useless for further processing once it has been parsed by RapidXml.

In many cases, however, these are not serious issues.

3.2 Ownership Of Strings

Nodes and attributes produced by RapidXml do not own their name and value strings. They merely hold the pointers to them. This means you have to be careful when setting these values manually, by using xml_base::name(const Ch *) or xml_base::value(const Ch *) functions. Care must be taken to ensure that lifetime of the string passed is at least as long as lifetime of the node/attribute. The easiest way to achieve it is to allocate the string from memory_pool owned by the document. Use memory_pool::allocate_string() function for this purpose.
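A minimal sketch, assuming doc already holds a parsed document and node points to one of its nodes:

// Copy the new value into memory owned by the document, then assign it;
// the copy lives as long as the document does
char *new_value = doc.allocate_string("some new value");
node->value(new_value);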

3.3 Destructive Vs Non-Destructive Mode

By default, the parser modifies source text during the parsing process. This is required to achieve character entity translation, whitespace normalization, and zero-termination of strings.

In some cases this behaviour may be undesirable, for example if the source text resides in read-only memory or is mapped to memory directly from a file. By using the appropriate parser flag (parse_non_destructive), source text modifications can be disabled. However, because RapidXml does in-situ parsing, this has the following side effects:
  • names and values of nodes and attributes are not zero-terminated; use name_size() and value_size() to determine their lengths
  • character entities are not translated
  • whitespace is not normalized
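A minimal sketch of non-destructive parsing, assuming text points to XML data that must not be modified:

using namespace rapidxml;
xml_document<> doc;
doc.parse<parse_non_destructive>(text);       // source text is left untouched

xml_node<> *root = doc.first_node();
// Names and values are not zero-terminated in this mode, so use the explicit sizes
std::size_t name_len = root->name_size();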

4. Performance

RapidXml achieves its speed through use of several techniques:
  • In-situ parsing. When building DOM tree, RapidXml does not make copies of string data, such as node names and values. Instead, it stores pointers to interior of the source text.
  • Use of template metaprogramming techniques. This allows it to move much of the work to compile time. Through magic of the templates, C++ compiler generates a separate copy of parsing code for any combination of parser flags you use. In each copy, all possible decisions are made at compile time and all unused code is omitted.
  • Extensive use of lookup tables for parsing.
  • Hand-tuned C++ with profiling done on several most popular CPUs.
This results in very small and fast code: a parser that is custom-tailored to the exact needs of each invocation.

4.1 Comparison With Other Parsers

The table below compares speed of RapidXml to some other parsers, and to strlen() function executed on the same data. On a modern CPU (as of 2007), you can expect parsing throughput to be close to 1 GB/s. As a rule of thumb, parsing speed is about 50-100x faster than Xerces DOM, 30-60x faster than TinyXml, 3-12x faster than pugxml, and about 5% - 30% faster than pugixml, the fastest XML parser I know of.
  • The test file is a real-world, 50kB large, moderately dense XML file.
  • All timing is done by using RDTSC instruction present in Pentium-compatible CPUs.
  • No profile-guided optimizations are used.
  • All parsers are running in their fastest modes.
  • The results are given in CPU cycles per character, so frequency of CPUs is irrelevant.
  • The results are minimum values from a large number of runs, to minimize effects of operating system activity, task switching, interrupt handling etc.
  • A single parse of the test file takes about 1/10th of a millisecond, so with large number of runs there is a good chance of hitting at least one no-interrupt streak, and obtaining undisturbed results.
Platform     Compiler    strlen()   RapidXml   pugixml 0.3   pugxml   TinyXml
Pentium 4    MSVC 8.0    2.5        5.4        7.0           61.7     298.8
Pentium 4    gcc 4.1.1   0.8        6.1        9.5           67.0     413.2
Core 2       MSVC 8.0    1.0        4.5        5.0           24.6     154.8
Core 2       gcc 4.1.1   0.6        4.6        5.4           28.3     229.3
Athlon XP    MSVC 8.0    3.1        7.7        8.0           25.5     182.6
Athlon XP    gcc 4.1.1   0.9        8.2        9.2           33.7     265.2
Pentium 3    MSVC 8.0    2.0        6.3        7.0           30.9     211.9
Pentium 3    gcc 4.1.1   1.0        6.7        8.9           35.3     316.0

(*) All results are in CPU cycles per character of source text

5. Reference

This section lists all classes, functions, constants etc. and describes them in detail.
class + template + rapidxml::memory_pool
+ constructor + memory_pool()
+ destructor + ~memory_pool()
function allocate_node(node_type type, const Ch *name=0, const Ch *value=0, std::size_t name_size=0, std::size_t value_size=0)
function allocate_attribute(const Ch *name=0, const Ch *value=0, std::size_t name_size=0, std::size_t value_size=0)
function allocate_string(const Ch *source=0, std::size_t size=0)
function clone_node(const xml_node< Ch > *source, xml_node< Ch > *result=0)
function clear()
function set_allocator(alloc_func *af, free_func *ff)

class rapidxml::parse_error
+ constructor + parse_error(const char *what, void *where)
function what() const
function where() const

class + template + rapidxml::xml_attribute
+ constructor + xml_attribute()
function document() const
function previous_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function next_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const

class + template + rapidxml::xml_base
+ constructor + xml_base()
function name() const
function name_size() const
function value() const
function value_size() const
function name(const Ch *name, std::size_t size)
function name(const Ch *name)
function value(const Ch *value, std::size_t size)
function value(const Ch *value)
function parent() const

class + template + rapidxml::xml_document
+ constructor + xml_document()
function parse(Ch *text)
function clear()

class + template + rapidxml::xml_node
+ constructor + xml_node(node_type type)
function type() const
function document() const
function first_node(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function last_node(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function previous_sibling(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function next_sibling(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function first_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function last_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const
function type(node_type type)
function prepend_node(xml_node< Ch > *child)
function append_node(xml_node< Ch > *child)
function insert_node(xml_node< Ch > *where, xml_node< Ch > *child)
function remove_first_node()
function remove_last_node()
function remove_node(xml_node< Ch > *where)
function remove_all_nodes()
function prepend_attribute(xml_attribute< Ch > *attribute)
function append_attribute(xml_attribute< Ch > *attribute)
function insert_attribute(xml_attribute< Ch > *where, xml_attribute< Ch > *attribute)
function remove_first_attribute()
function remove_last_attribute()
function remove_attribute(xml_attribute< Ch > *where)
function remove_all_attributes()

namespace rapidxml
enum node_type
function parse_error_handler(const char *what, void *where)
function print(OutIt out, const xml_node< Ch > &node, int flags=0)
function print(std::basic_ostream< Ch > &out, const xml_node< Ch > &node, int flags=0)
function operator<<(std::basic_ostream< Ch > &out, const xml_node< Ch > &node)
+ constant + parse_no_data_nodes
+ constant + parse_no_element_values
+ constant + parse_no_string_terminators
+ constant + parse_no_entity_translation
+ constant + parse_no_utf8
+ constant + parse_declaration_node
+ constant + parse_comment_nodes
+ constant + parse_doctype_node
+ constant + parse_pi_nodes
+ constant + parse_validate_closing_tags
+ constant + parse_trim_whitespace
+ constant + parse_normalize_whitespace
+ constant + parse_default
+ constant + parse_non_destructive
+ constant + parse_fastest
+ constant + parse_full
+ constant + print_no_indenting


class template rapidxml::memory_pool

Defined in rapidxml.hpp
Base class for xml_document

Description

This class is used by the parser to create new nodes and attributes, without the overhead of dynamic memory allocation. In most cases, you will not need to use this class directly. However, if you need to create nodes manually or modify names/values of nodes, you are encouraged to use the memory_pool of the relevant xml_document to allocate the memory. Not only is this faster than allocating them by using the new operator, but their lifetime will also be tied to the lifetime of the document, possibly simplifying memory management.

Call the allocate_node() or allocate_attribute() functions to obtain new nodes or attributes from the pool. You can also call the allocate_string() function to allocate strings. Such strings can then be used as names or values of nodes without worrying about their lifetime. Note that there is no free() function -- all allocations are freed at once when the clear() function is called, or when the pool is destroyed.

It is also possible to create a standalone memory_pool and use it to allocate nodes whose lifetime will not be tied to any document.
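For instance, a brief sketch of a standalone pool (the element name "item" is an arbitrary example):

rapidxml::memory_pool<> pool;                 // standalone pool, not tied to any document
rapidxml::xml_node<> *item =
    pool.allocate_node(rapidxml::node_element, "item", "some value");
// item, and everything else allocated from pool, is freed when the pool is
// destroyed or when pool.clear() is called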

The pool maintains RAPIDXML_STATIC_POOL_SIZE bytes of statically allocated memory. Until static memory is exhausted, no dynamic memory allocations are done. When static memory is exhausted, the pool allocates additional blocks of memory of size RAPIDXML_DYNAMIC_POOL_SIZE each, using the global new[] and delete[] operators. This behaviour can be changed by setting custom allocation routines; use the set_allocator() function to set them.

Allocations for nodes, attributes and strings are aligned at RAPIDXML_ALIGNMENT bytes. This value defaults to the size of a pointer on the target architecture.

To obtain top performance from the parser, it is important that all nodes are allocated from a single, contiguous block of memory; otherwise, cache misses when jumping between two (or more) disjoint blocks of memory can slow down parsing considerably. If required, you can tweak RAPIDXML_STATIC_POOL_SIZE, RAPIDXML_DYNAMIC_POOL_SIZE and RAPIDXML_ALIGNMENT to obtain the best trade-off between wasted memory and performance. To do so, define their values before the rapidxml.hpp file is included.
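A brief sketch of such tuning; the sizes shown are arbitrary examples, not recommended values:

// Must appear before rapidxml.hpp is included
#define RAPIDXML_STATIC_POOL_SIZE  (256 * 1024)    // 256 kB static pool
#define RAPIDXML_DYNAMIC_POOL_SIZE (256 * 1024)    // grow in 256 kB blocks
#include "rapidxml.hpp"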

Parameters

Ch
Character type of created nodes.

+ constructor + memory_pool::memory_pool

Synopsis

memory_pool(); +

Description

Constructs empty pool with default allocator functions.

+ destructor + memory_pool::~memory_pool

Synopsis

~memory_pool(); +

Description

Destroys pool and frees all the memory. This causes memory occupied by nodes allocated by the pool to be freed. Nodes allocated from the pool are no longer valid.

function memory_pool::allocate_node

Synopsis

xml_node<Ch>* allocate_node(node_type type, const Ch *name=0, const Ch *value=0, std::size_t name_size=0, std::size_t value_size=0); +

Description

Allocates a new node from the pool, and optionally assigns name and value to it. If the allocation request cannot be accommodated, this function will throw std::bad_alloc. If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function will call the rapidxml::parse_error_handler() function.

Parameters

type
Type of node to create.
name
Name to assign to the node, or 0 to assign no name.
value
Value to assign to the node, or 0 to assign no value.
name_size
Size of name to assign, or 0 to automatically calculate size from name string.
value_size
Size of value to assign, or 0 to automatically calculate size from value string.

Returns

Pointer to allocated node. This pointer will never be NULL.

function memory_pool::allocate_attribute

Synopsis

xml_attribute<Ch>* allocate_attribute(const Ch *name=0, const Ch *value=0, std::size_t name_size=0, std::size_t value_size=0); +

Description

Allocates a new attribute from the pool, and optionally assigns name and value to it. If the allocation request cannot be accommodated, this function will throw std::bad_alloc. If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function will call the rapidxml::parse_error_handler() function.

Parameters

name
Name to assign to the attribute, or 0 to assign no name.
value
Value to assign to the attribute, or 0 to assign no value.
name_size
Size of name to assign, or 0 to automatically calculate size from name string.
value_size
Size of value to assign, or 0 to automatically calculate size from value string.

Returns

Pointer to allocated attribute. This pointer will never be NULL.

function memory_pool::allocate_string

Synopsis

Ch* allocate_string(const Ch *source=0, std::size_t size=0); +

Description

Allocates a char array of given size from the pool, and optionally copies a given string to it. If the allocation request cannot be accommodated, this function will throw std::bad_alloc. If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function will call the rapidxml::parse_error_handler() function.

Parameters

source
String to initialize the allocated memory with, or 0 to not initialize it.
size
Number of characters to allocate, or zero to calculate it automatically from source string length; if size is 0, source string must be specified and null terminated.

Returns

Pointer to allocated char array. This pointer will never be NULL.

function memory_pool::clone_node

Synopsis

xml_node<Ch>* clone_node(const xml_node< Ch > *source, xml_node< Ch > *result=0); +

Description

Clones an xml_node and its hierarchy of child nodes and attributes. Nodes and attributes are allocated from this memory pool. Names and values are not cloned; they are shared between the clone and the source. A result node can optionally be specified as a second parameter, in which case its contents will be replaced with the cloned source node. This is useful when you want to clone an entire document.

Parameters

source
Node to clone.
result
Node to put results in, or 0 to automatically allocate result node

Returns

Pointer to cloned node. This pointer will never be NULL.
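As a brief sketch (assuming doc already contains a parsed tree), a subtree can be duplicated within the same document:

// Clone the first child of the document and append the copy.
// Names and values are shared with the source node, so the original
// source text must stay alive for as long as the clone is used.
rapidxml::xml_node<> *copy = doc.clone_node(doc.first_node());
doc.append_node(copy);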

function memory_pool::clear

Synopsis

void clear(); +

Description

Clears the pool. This causes memory occupied by nodes allocated by the pool to be freed. Any nodes or strings allocated from the pool will no longer be valid.

function memory_pool::set_allocator

Synopsis

void set_allocator(alloc_func *af, free_func *ff); +

Description

Sets or resets the user-defined memory allocation functions for the pool. This can only be called when no memory has been allocated from the pool yet; otherwise results are undefined. The allocation function must not return an invalid pointer on failure. It should either throw, stop the program, or use the longjmp() function to pass control to another place in the program. If it returns an invalid pointer, results are undefined.

User-defined allocation functions must have the following forms:

void *allocate(std::size_t size);
void free(void *pointer);

Parameters

af
Allocation function, or 0 to restore default function
ff
Free function, or 0 to restore default function
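A minimal sketch of plugging in custom allocation routines; the names my_alloc and my_free and the abort-on-failure policy are illustrative assumptions:

#include <cstdlib>
#include "rapidxml.hpp"

void *my_alloc(std::size_t size)
{
    void *p = std::malloc(size);
    if (!p) std::abort();                     // must never return an invalid pointer
    return p;
}

void my_free(void *pointer) { std::free(pointer); }

int main()
{
    rapidxml::xml_document<> doc;
    doc.set_allocator(&my_alloc, &my_free);   // set before any allocation from the pool
    // ... build or parse the document as usual
}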

class rapidxml::parse_error

Defined in rapidxml.hpp

Description

Parse error exception. This exception is thrown by the parser when an error occurs. Use what() function to get human-readable error message. Use where() function to get a pointer to position within source text where error was detected.

If throwing exceptions by the parser is undesirable, it can be disabled by defining the RAPIDXML_NO_EXCEPTIONS macro before rapidxml.hpp is included. This will cause the parser to call the rapidxml::parse_error_handler() function instead of throwing an exception. This function must be defined by the user.

This class derives from the std::exception class.
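A minimal sketch of the user-supplied handler when exceptions are disabled; the abort-on-error policy shown here is only an example:

#define RAPIDXML_NO_EXCEPTIONS
#include <cstdio>
#include <cstdlib>
#include "rapidxml.hpp"

// Must be provided by the user when RAPIDXML_NO_EXCEPTIONS is defined;
// it should not return control to the parser
void rapidxml::parse_error_handler(const char *what, void *where)
{
    (void)where;                              // location within source text, unused here
    std::fprintf(stderr, "rapidxml parse error: %s\n", what);
    std::abort();
}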

+ constructor + parse_error::parse_error

Synopsis

parse_error(const char *what, void *where); +

Description

Constructs parse error.

function parse_error::what

Synopsis

virtual const char* what() const; +

Description

Gets human readable description of error.

Returns

Pointer to null terminated description of the error.

function parse_error::where

Synopsis

Ch* where() const; +

Description

Gets pointer to character data where error happened. Ch should be the same as char type of xml_document that produced the error.

Returns

Pointer to the location within the parsed string where the error occurred.

class template rapidxml::xml_attribute

Defined in rapidxml.hpp
Inherits from xml_base

Description

Class representing attribute node of XML document. Each attribute has name and value strings, which are available through name() and value() functions (inherited from xml_base). Note that after parse, both name and value of attribute will point to interior of source text used for parsing. Thus, this text must persist in memory for the lifetime of attribute.

Parameters

Ch
Character type to use.

+ constructor + xml_attribute::xml_attribute

Synopsis

xml_attribute(); +

Description

Constructs an empty attribute. Consider using the memory_pool of the appropriate xml_document if allocating attributes manually.

function xml_attribute::document

Synopsis

xml_document<Ch>* document() const; +

Description

Gets document of which attribute is a child.

Returns

Pointer to document that contains this attribute, or 0 if there is no parent document.

function xml_attribute::previous_attribute

Synopsis

xml_attribute<Ch>* previous_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets previous attribute, optionally matching attribute name.

Parameters

name
Name of attribute to find, or 0 to return previous attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found attribute, or 0 if not found.

function xml_attribute::next_attribute

Synopsis

xml_attribute<Ch>* next_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets next attribute, optionally matching attribute name.

Parameters

name
Name of attribute to find, or 0 to return next attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found attribute, or 0 if not found.

class template rapidxml::xml_base

Defined in rapidxml.hpp
Base class for xml_attribute, xml_node

Description

Base class for xml_node and xml_attribute implementing common functions: name(), name_size(), value(), value_size() and parent().

Parameters

Ch
Character type to use

+ constructor + xml_base::xml_base

Synopsis

xml_base(); +

function xml_base::name

Synopsis

Ch* name() const; +

Description

Gets name of the node. Interpretation of name depends on type of node. Note that name will not be zero-terminated if rapidxml::parse_no_string_terminators option was selected during parse.

+ Use name_size() function to determine length of the name.

Returns

Name of node, or empty string if node has no name.

function xml_base::name_size

Synopsis

std::size_t name_size() const; +

Description

Gets size of node name, not including terminator character. This function works correctly irrespective of whether name is or is not zero terminated.

Returns

Size of node name, in characters.

function xml_base::value

Synopsis

Ch* value() const; +

Description

Gets value of node. Interpretation of value depends on type of node. Note that value will not be zero-terminated if rapidxml::parse_no_string_terminators option was selected during parse.

+ Use value_size() function to determine length of the value.

Returns

Value of node, or empty string if node has no value.

function xml_base::value_size

Synopsis

std::size_t value_size() const; +

Description

Gets size of node value, not including terminator character. This function works correctly irrespective of whether value is or is not zero terminated.

Returns

Size of node value, in characters.

function xml_base::name

Synopsis

void name(const Ch *name, std::size_t size); +

Description

Sets name of node to a non zero-terminated string. See Ownership Of Strings .

Note that the node does not own its name or value; it only stores a pointer to it. It will not delete or otherwise free the pointer on destruction. It is the responsibility of the user to properly manage the lifetime of the string. The easiest way to achieve this is to use the memory_pool of the document to allocate the string - on destruction of the document the string will be automatically freed.

The size of the name must be specified separately, because the name does not have to be zero terminated. Use the name(const Ch *) function to have the length automatically calculated (the string must be zero terminated).

Parameters

name
Name of node to set. Does not have to be zero terminated.
size
Size of name, in characters. This does not include zero terminator, if one is present.

function xml_base::name

Synopsis

void name(const Ch *name); +

Description

Sets name of node to a zero-terminated string. See also Ownership Of Strings and xml_node::name(const Ch *, std::size_t).

Parameters

name
Name of node to set. Must be zero terminated.

function xml_base::value

Synopsis

void value(const Ch *value, std::size_t size); +

Description

Sets value of node to a non zero-terminated string. See Ownership Of Strings .

Note that the node does not own its name or value; it only stores a pointer to it. It will not delete or otherwise free the pointer on destruction. It is the responsibility of the user to properly manage the lifetime of the string. The easiest way to achieve this is to use the memory_pool of the document to allocate the string - on destruction of the document the string will be automatically freed.

The size of the value must be specified separately, because it does not have to be zero terminated. Use the value(const Ch *) function to have the length automatically calculated (the string must be zero terminated).

If an element has a child node of type node_data, it will take precedence over the element value when printing. If you want to manipulate the data of elements using values, use the parser flag rapidxml::parse_no_data_nodes to prevent creation of data nodes by the parser.

Parameters

value
value of node to set. Does not have to be zero terminated.
size
Size of value, in characters. This does not include zero terminator, if one is present.

function xml_base::value

Synopsis

void value(const Ch *value); +

Description

Sets value of node to a zero-terminated string. See also Ownership Of Strings and xml_node::value(const Ch *, std::size_t).

Parameters

value
Value of node to set. Must be zero terminated.

function xml_base::parent

Synopsis

xml_node<Ch>* parent() const; +

Description

Gets node parent.

Returns

Pointer to parent node, or 0 if there is no parent.

class template rapidxml::xml_document

Defined in rapidxml.hpp
Inherits from xml_node, memory_pool

Description

This class represents root of the DOM hierarchy. It is also an xml_node and a memory_pool through public inheritance. Use parse() function to build a DOM tree from a zero-terminated XML text string. parse() function allocates memory for nodes and attributes by using functions of xml_document, which are inherited from memory_pool. To access root node of the document, use the document itself, as if it was an xml_node.

Parameters

Ch
Character type to use.

+ constructor + xml_document::xml_document

Synopsis

xml_document(); +

Description

Constructs empty XML document.

function xml_document::parse

Synopsis

void parse(Ch *text); +

Description

Parses a zero-terminated XML string according to the given flags. The passed string will be modified by the parser, unless the rapidxml::parse_non_destructive flag is used. The string must persist for the lifetime of the document. In case of error, a rapidxml::parse_error exception will be thrown.

If you want to parse the contents of a file, you must first load the file into memory and pass a pointer to its beginning. Make sure that the data is zero-terminated.

A document can be parsed multiple times. Each new call to parse removes previous nodes and attributes (if any), but does not clear the memory pool.

Parameters

text
XML data to parse; pointer is non-const to denote fact that this data may be modified by the parser.
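As a brief sketch using only the standard library (the helper in rapidxml_utils.hpp can be used instead), a file can be read into a writable, zero-terminated buffer and parsed:

#include <fstream>
#include <iterator>
#include <vector>
#include "rapidxml.hpp"

void parse_file(const char *path)
{
    std::ifstream in(path, std::ios::binary);
    std::vector<char> buffer((std::istreambuf_iterator<char>(in)),
                             std::istreambuf_iterator<char>());
    buffer.push_back('\0');                   // parse() requires zero-terminated text

    rapidxml::xml_document<> doc;
    doc.parse<0>(&buffer[0]);                 // buffer must outlive all uses of doc
    // ... use doc here, while buffer is still in scope
}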

function xml_document::clear

Synopsis

void clear(); +

Description

Clears the document by deleting all nodes and clearing the memory pool. All nodes owned by document pool are destroyed.

class template rapidxml::xml_node

Defined in rapidxml.hpp
Inherits from xml_base
Base class for xml_document

Description

Class representing a node of XML document. Each node may have associated name and value strings, which are available through name() and value() functions. Interpretation of name and value depends on type of the node. Type of node can be determined by using type() function.

Note that after parsing, both the name and value of a node, if any, will point to the interior of the source text used for parsing. Thus, this text must persist in memory for the lifetime of the node.

Parameters

Ch
Character type to use.

+ constructor + xml_node::xml_node

Synopsis

xml_node(node_type type); +

Description

Constructs an empty node with the specified type. Consider using memory_pool of appropriate document to allocate nodes manually.

Parameters

type
Type of node to construct.

function xml_node::type

Synopsis

node_type type() const; +

Description

Gets type of node.

Returns

Type of node.

function xml_node::document

Synopsis

xml_document<Ch>* document() const; +

Description

Gets document of which node is a child.

Returns

Pointer to document that contains this node, or 0 if there is no parent document.

function xml_node::first_node

Synopsis

xml_node<Ch>* first_node(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets first child node, optionally matching node name.

Parameters

name
Name of child to find, or 0 to return first child regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found child, or 0 if not found.

function xml_node::last_node

Synopsis

xml_node<Ch>* last_node(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets last child node, optionally matching node name. Behaviour is undefined if node has no children. Use first_node() to test if node has children.

Parameters

name
Name of child to find, or 0 to return last child regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found child, or 0 if not found.

function xml_node::previous_sibling

Synopsis

xml_node<Ch>* previous_sibling(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets previous sibling node, optionally matching node name. Behaviour is undefined if node has no parent. Use parent() to test if node has a parent.

Parameters

name
Name of sibling to find, or 0 to return previous sibling regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found sibling, or 0 if not found.

function xml_node::next_sibling

Synopsis

xml_node<Ch>* next_sibling(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets next sibling node, optionally matching node name. Behaviour is undefined if node has no parent. Use parent() to test if node has a parent.

Parameters

name
Name of sibling to find, or 0 to return next sibling regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found sibling, or 0 if not found.

function xml_node::first_attribute

Synopsis

xml_attribute<Ch>* first_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets first attribute of node, optionally matching attribute name.

Parameters

name
Name of attribute to find, or 0 to return first attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found attribute, or 0 if not found.

function xml_node::last_attribute

Synopsis

xml_attribute<Ch>* last_attribute(const Ch *name=0, std::size_t name_size=0, bool case_sensitive=true) const; +

Description

Gets last attribute of node, optionally matching attribute name.

Parameters

name
Name of attribute to find, or 0 to return last attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero
name_size
Size of name, in characters, or 0 to have size calculated automatically from string
case_sensitive
Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters

Returns

Pointer to found attribute, or 0 if not found.
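
A short lookup sketch, assuming "node" is an xml_node<char>* obtained from a parsed document; the attribute name "id" is illustrative (xml_attribute::next_attribute is documented separately):

    // look up a specific attribute by name; 0 is returned if it is absent
    rapidxml::xml_attribute<char> *id = node->first_attribute("id");

    // or walk all attributes of the node
    for (rapidxml::xml_attribute<char> *attr = node->first_attribute();
         attr;
         attr = attr->next_attribute())
    {
        // attr->name() and attr->value() point into the parsed source text
    }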

function xml_node::type

Synopsis

void type(node_type type);

Description

Sets type of node.

Parameters

type
Type of node to set.

function xml_node::prepend_node

Synopsis

void prepend_node(xml_node< Ch > *child);

Description

Prepends a new child node. The prepended child becomes the first child, and all existing children are moved one position back.

Parameters

child
Node to prepend.

function xml_node::append_node

Synopsis

void append_node(xml_node< Ch > *child);

Description

Appends a new child node. The appended child becomes the last child.

Parameters

child
Node to append.

function xml_node::insert_node

Synopsis

void insert_node(xml_node< Ch > *where, xml_node< Ch > *child);

Description

Inserts a new child node at specified place inside the node. All children after and including the specified node are moved one position back.

Parameters

where
Place where to insert the child, or 0 to insert at the back.
child
Node to insert.
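
A small sketch of building a tree from pool-allocated nodes; the element names are made up for illustration:

    void build(rapidxml::xml_document<char> &doc)
    {
        using namespace rapidxml;
        xml_node<char> *root  = doc.allocate_node(node_element, "root");
        xml_node<char> *first = doc.allocate_node(node_element, "first");
        xml_node<char> *last  = doc.allocate_node(node_element, "last");
        doc.append_node(root);              // root becomes the only child of the document
        root->append_node(last);            // appended as the last child of root
        root->prepend_node(first);          // becomes the first child, pushing "last" back
        root->insert_node(last, doc.allocate_node(node_element, "middle"));   // inserted just before "last"
    }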

function xml_node::remove_first_node

Synopsis

void remove_first_node();

Description

Removes first child node. If node has no children, behaviour is undefined. Use first_node() to test if node has children.

function xml_node::remove_last_node

Synopsis

void remove_last_node();

Description

Removes last child of the node. If node has no children, behaviour is undefined. Use first_node() to test if node has children.

function xml_node::remove_node

Synopsis

void remove_node(xml_node< Ch > *where);

Description

Removes specified child from the node.

Parameters

where
Pointer to child to be removed.

function xml_node::remove_all_nodes

Synopsis

void remove_all_nodes();

Description

Removes all child nodes (but not attributes).

function xml_node::prepend_attribute

Synopsis

void prepend_attribute(xml_attribute< Ch > *attribute);

Description

Prepends a new attribute to the node.

Parameters

attribute
Attribute to prepend.

function xml_node::append_attribute

Synopsis

void append_attribute(xml_attribute< Ch > *attribute);

Description

Appends a new attribute to the node.

Parameters

attribute
Attribute to append.

function xml_node::insert_attribute

Synopsis

void insert_attribute(xml_attribute< Ch > *where, xml_attribute< Ch > *attribute);

Description

Inserts a new attribute at specified place inside the node. All attributes after and including the specified attribute are moved one position back.

Parameters

where
Place where to insert the attribute, or 0 to insert at the back.
attribute
Attribute to insert.
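
Continuing the build() sketch above (doc and root as there), attributes are allocated from the same pool; the names and values are illustrative:

    xml_attribute<char> *id = doc.allocate_attribute("id", "42");
    root->append_attribute(id);                                          // becomes the last attribute of root
    root->prepend_attribute(doc.allocate_attribute("version", "1"));     // becomes the first attribute of root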

function xml_node::remove_first_attribute

Synopsis

void remove_first_attribute();

Description

Removes first attribute of the node. If node has no attributes, behaviour is undefined. Use first_attribute() to test if node has attributes.

function xml_node::remove_last_attribute

Synopsis

void remove_last_attribute();

Description

Removes last attribute of the node. If node has no attributes, behaviour is undefined. Use first_attribute() to test if node has attributes.

function xml_node::remove_attribute

Synopsis

void remove_attribute(xml_attribute< Ch > *where);

Description

Removes specified attribute from node.

Parameters

where
Pointer to attribute to be removed.

function xml_node::remove_all_attributes

Synopsis

void remove_all_attributes();

Description

Removes all attributes of node.

enum node_type

Description

Enumeration listing all node types produced by the parser. Use xml_node::type() function to query node type.

Values

node_document
A document node. Name and value are empty.
node_element
An element node. Name contains element name. Value contains text of first data node.
node_data
A data node. Name is empty. Value contains data text.
node_cdata
A CDATA node. Name is empty. Value contains data text.
node_comment
A comment node. Name is empty. Value contains comment text.
node_declaration
A declaration node. Name and value are empty. Declaration parameters (version, encoding and standalone) are in node attributes.
node_doctype
A DOCTYPE node. Name is empty. Value contains DOCTYPE text.
node_pi
A PI node. Name contains target. Value contains instructions.
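
A brief dispatch sketch; "node" is assumed to be an xml_node<char>* obtained from a parsed document, and the handling in each case is only indicated by comments:

    switch (node->type())
    {
        case rapidxml::node_element:  /* node->name(), child nodes and attributes */  break;
        case rapidxml::node_data:
        case rapidxml::node_cdata:    /* node->value() holds the text */              break;
        case rapidxml::node_comment:  /* node->value() holds the comment text */      break;
        default:                                                                       break;
    }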

function parse_error_handler

Synopsis

void rapidxml::parse_error_handler(const char *what, void *where);

Description

When exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function is called to notify user about the error. It must be defined by the user.

This function cannot return. If it does, the results are undefined.

A very simple definition might look like this:

    void rapidxml::parse_error_handler(const char *what, void *where)
    {
        std::cout << "Parse error: " << what << "\n";
        std::abort();
    }

Parameters

what
Human readable description of the error.
where
Pointer to character data where error was detected.
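
A minimal sketch of wiring this up; note that RAPIDXML_NO_EXCEPTIONS must be defined before rapidxml.hpp is included:

    #define RAPIDXML_NO_EXCEPTIONS
    #include <cstdio>
    #include <cstdlib>
    #include "rapidxml.hpp"

    void rapidxml::parse_error_handler(const char *what, void *where)
    {
        (void)where;                                     // position within source text, unused here
        std::fprintf(stderr, "Parse error: %s\n", what);
        std::abort();                                    // this handler must not return
    }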

function print

Synopsis

OutIt rapidxml::print(OutIt out, const xml_node< Ch > &node, int flags=0);

Description

Prints XML to given output iterator.

Parameters

out
Output iterator to print to.
node
Node to be printed. Pass xml_document to print entire document.
flags
Flags controlling how XML is printed.

Returns

Output iterator pointing to position immediately after last character of printed text.

function print

Synopsis

std::basic_ostream<Ch>& rapidxml::print(std::basic_ostream< Ch > &out, const xml_node< Ch > &node, int flags=0);

Description

Prints XML to given output stream.

Parameters

out
Output stream to print to.
node
Node to be printed. Pass xml_document to print entire document.
flags
Flags controlling how XML is printed.

Returns

Output stream.

function operator<<

Synopsis

std::basic_ostream<Ch>& rapidxml::operator<<(std::basic_ostream< Ch > &out, const xml_node< Ch > &node);

Description

Prints formatted XML to given output stream. Uses default printing flags. Use print() function to customize printing process.

Parameters

out
Output stream to print to.
node
Node to be printed.

Returns

Output stream.
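
A short printing sketch; print() and operator<< are provided by rapidxml_print.hpp, and the dump() helper is an illustrative assumption:

    #include <iostream>
    #include <iterator>
    #include <string>
    #include "rapidxml.hpp"
    #include "rapidxml_print.hpp"

    void dump(const rapidxml::xml_document<char> &doc)
    {
        std::string text;
        rapidxml::print(std::back_inserter(text), doc, 0);   // iterator overload, default flags
        std::cout << text;
        std::cout << doc;                                     // stream overload via operator<<
    }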

constant parse_no_data_nodes

Synopsis

const int parse_no_data_nodes = 0x1;

Description

Parse flag instructing the parser to not create data nodes. Text of first data node will still be placed in value of parent element, unless rapidxml::parse_no_element_values flag is also specified. Can be combined with other flags by use of | operator.

See xml_document::parse() function.
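
Parse flags are passed as a template argument to xml_document::parse() and combined with the | operator. A brief sketch, assuming "buffer" is a mutable, zero-terminated copy of the XML text:

    rapidxml::xml_document<char> doc;
    doc.parse<rapidxml::parse_no_data_nodes
            | rapidxml::parse_no_element_values>(buffer);    // buffer is char*, modified in place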

constant parse_no_element_values

Synopsis

const int parse_no_element_values = 0x2;

Description

Parse flag instructing the parser to not use text of first data node as a value of parent element. Can be combined with other flags by use of | operator. Note that child data nodes of element node take precedence over its value when printing. That is, if element has one or more child data nodes and a value, the value will be ignored. Use rapidxml::parse_no_data_nodes flag to prevent creation of data nodes if you want to manipulate data using values of elements.

See xml_document::parse() function.

constant parse_no_string_terminators

Synopsis

const int parse_no_string_terminators = 0x4;

Description

Parse flag instructing the parser to not place zero terminators after strings in the source text. By default zero terminators are placed, modifying source text. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_no_entity_translation

Synopsis

const int parse_no_entity_translation = 0x8;

Description

Parse flag instructing the parser to not translate entities in the source text. By default entities are translated, modifying source text. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_no_utf8

Synopsis

const int parse_no_utf8 = 0x10;

Description

Parse flag instructing the parser to disable UTF-8 handling and assume plain 8 bit characters. By default, UTF-8 handling is enabled. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_declaration_node

Synopsis

const int parse_declaration_node = 0x20;

Description

Parse flag instructing the parser to create XML declaration node. By default, declaration node is not created. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_comment_nodes

Synopsis

const int parse_comment_nodes = 0x40;

Description

Parse flag instructing the parser to create comment nodes. By default, comment nodes are not created. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_doctype_node

Synopsis

const int parse_doctype_node = 0x80;

Description

Parse flag instructing the parser to create DOCTYPE node. By default, doctype node is not created. Although W3C specification allows at most one DOCTYPE node, RapidXml will silently accept documents with more than one. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_pi_nodes

Synopsis

const int parse_pi_nodes = 0x100;

Description

Parse flag instructing the parser to create PI nodes. By default, PI nodes are not created. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_validate_closing_tags

Synopsis

const int parse_validate_closing_tags = 0x200;

Description

Parse flag instructing the parser to validate closing tag names. If not set, name inside closing tag is irrelevant to the parser. By default, closing tags are not validated. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_trim_whitespace

Synopsis

const int parse_trim_whitespace = 0x400;

Description

Parse flag instructing the parser to trim all leading and trailing whitespace of data nodes. By default, whitespace is not trimmed. This flag does not cause the parser to modify source text. Can be combined with other flags by use of | operator.

See xml_document::parse() function.

constant parse_normalize_whitespace

Synopsis

const int parse_normalize_whitespace = 0x800;

Description

Parse flag instructing the parser to condense all whitespace runs of data nodes to a single space character. Trimming of leading and trailing whitespace of data is controlled by rapidxml::parse_trim_whitespace flag. By default, whitespace is not normalized. If this flag is specified, source text will be modified. Can be combined with other flags by use of | operator.

See xml_document::parse() function.
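
For example, a parse that both trims and condenses whitespace in data nodes (with doc and buffer as in the earlier sketch):

    doc.parse<rapidxml::parse_trim_whitespace
            | rapidxml::parse_normalize_whitespace>(buffer);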

constant parse_default

Synopsis

const int parse_default = 0;

Description

Parse flags which represent the default behaviour of the parser. This is always equal to 0, so that all other flags can simply be ORed together. Normally there is no need to inconveniently disable flags by ANDing with their negated (~) values. This also means that the meaning of each flag is a negation of the default setting. For example, if the flag name is rapidxml::parse_no_utf8, it means that UTF-8 is enabled by default, and using the flag will disable it.

See xml_document::parse() function.

constant parse_non_destructive

Synopsis

const int parse_non_destructive = parse_no_string_terminators | parse_no_entity_translation;

Description

A combination of parse flags that forbids any modifications of the source text. This also results in faster parsing. However, note that the following will occur:
  • names and values of nodes will not be zero terminated; you have to use xml_base::name_size() and xml_base::value_size() functions to determine where name and value end
  • entities will not be translated
  • whitespace will not be normalized

See xml_document::parse() function.
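
Because names and values are not zero terminated under this flag set, pair name()/value() with name_size()/value_size(); a brief sketch (doc and buffer as in the earlier sketches):

    doc.parse<rapidxml::parse_non_destructive>(buffer);
    rapidxml::xml_node<char> *node = doc.first_node();            // 0 if the document is empty
    std::string name(node->name(), node->name_size());            // copy out the non-terminated name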

constant parse_fastest

Synopsis

const int parse_fastest = parse_non_destructive | parse_no_data_nodes;

Description

A combination of parse flags resulting in fastest possible parsing, without sacrificing important data.

See xml_document::parse() function.

constant parse_full

Synopsis

const int parse_full = parse_declaration_node | parse_comment_nodes | parse_doctype_node | parse_pi_nodes | parse_validate_closing_tags;

Description

A combination of parse flags resulting in largest amount of data being extracted. This usually results in slowest parsing.

See xml_document::parse() function.

constant print_no_indenting

Synopsis

const int print_no_indenting = 0x1;

Description

Printer flag instructing the printer to suppress indenting of XML. See print() function.
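
For example, reusing the printing sketch above, to print without indentation:

    rapidxml::print(std::back_inserter(text), doc, rapidxml::print_no_indenting);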

\ No newline at end of file diff --git a/thirdpt/rapidxml-1.13/rapidxml.hpp b/thirdpt/rapidxml-1.13/rapidxml.hpp new file mode 100644 index 0000000..ae91e08 --- /dev/null +++ b/thirdpt/rapidxml-1.13/rapidxml.hpp @@ -0,0 +1,2596 @@ +#ifndef RAPIDXML_HPP_INCLUDED +#define RAPIDXML_HPP_INCLUDED + +// Copyright (C) 2006, 2009 Marcin Kalicinski +// Version 1.13 +// Revision $DateTime: 2009/05/13 01:46:17 $ +//! \file rapidxml.hpp This file contains rapidxml parser and DOM implementation + +// If standard library is disabled, user must provide implementations of required functions and typedefs +#if !defined(RAPIDXML_NO_STDLIB) + #include // For std::size_t + #include // For assert + #include // For placement new +#endif + +// On MSVC, disable "conditional expression is constant" warning (level 4). +// This warning is almost impossible to avoid with certain types of templated code +#ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable:4127) // Conditional expression is constant +#endif + +/////////////////////////////////////////////////////////////////////////// +// RAPIDXML_PARSE_ERROR + +#if defined(RAPIDXML_NO_EXCEPTIONS) + +#define RAPIDXML_PARSE_ERROR(what, where) { parse_error_handler(what, where); assert(0); } + +namespace rapidxml +{ + //! When exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, + //! this function is called to notify user about the error. + //! It must be defined by the user. + //!

+ //! This function cannot return. If it does, the results are undefined. + //!

+ //! A very simple definition might look like that: + //!

+    //! void %rapidxml::%parse_error_handler(const char *what, void *where)
+    //! {
+    //!     std::cout << "Parse error: " << what << "\n";
+    //!     std::abort();
+    //! }
+    //! 
+ //! \param what Human readable description of the error. + //! \param where Pointer to character data where error was detected. + void parse_error_handler(const char *what, void *where); +} + +#else + +#include // For std::exception + +#define RAPIDXML_PARSE_ERROR(what, where) throw parse_error(what, where) + +namespace rapidxml +{ + + //! Parse error exception. + //! This exception is thrown by the parser when an error occurs. + //! Use what() function to get human-readable error message. + //! Use where() function to get a pointer to position within source text where error was detected. + //!

+ //! If throwing exceptions by the parser is undesirable, + //! it can be disabled by defining RAPIDXML_NO_EXCEPTIONS macro before rapidxml.hpp is included. + //! This will cause the parser to call rapidxml::parse_error_handler() function instead of throwing an exception. + //! This function must be defined by the user. + //!

+ //! This class derives from std::exception class. + class parse_error: public std::exception + { + + public: + + //! Constructs parse error + parse_error(const char *what, void *where) + : m_what(what) + , m_where(where) + { + } + + //! Gets human readable description of error. + //! \return Pointer to null terminated description of the error. + virtual const char *what() const throw() + { + return m_what; + } + + //! Gets pointer to character data where error happened. + //! Ch should be the same as char type of xml_document that produced the error. + //! \return Pointer to location within the parsed string where error occured. + template + Ch *where() const + { + return reinterpret_cast(m_where); + } + + private: + + const char *m_what; + void *m_where; + + }; +} + +#endif + +/////////////////////////////////////////////////////////////////////////// +// Pool sizes + +#ifndef RAPIDXML_STATIC_POOL_SIZE + // Size of static memory block of memory_pool. + // Define RAPIDXML_STATIC_POOL_SIZE before including rapidxml.hpp if you want to override the default value. + // No dynamic memory allocations are performed by memory_pool until static memory is exhausted. + #define RAPIDXML_STATIC_POOL_SIZE (64 * 1024) +#endif + +#ifndef RAPIDXML_DYNAMIC_POOL_SIZE + // Size of dynamic memory block of memory_pool. + // Define RAPIDXML_DYNAMIC_POOL_SIZE before including rapidxml.hpp if you want to override the default value. + // After the static block is exhausted, dynamic blocks with approximately this size are allocated by memory_pool. + #define RAPIDXML_DYNAMIC_POOL_SIZE (64 * 1024) +#endif + +#ifndef RAPIDXML_ALIGNMENT + // Memory allocation alignment. + // Define RAPIDXML_ALIGNMENT before including rapidxml.hpp if you want to override the default value, which is the size of pointer. + // All memory allocations for nodes, attributes and strings will be aligned to this value. + // This must be a power of 2 and at least 1, otherwise memory_pool will not work. + #define RAPIDXML_ALIGNMENT sizeof(void *) +#endif + +namespace rapidxml +{ + // Forward declarations + template class xml_node; + template class xml_attribute; + template class xml_document; + + //! Enumeration listing all node types produced by the parser. + //! Use xml_node::type() function to query node type. + enum node_type + { + node_document, //!< A document node. Name and value are empty. + node_element, //!< An element node. Name contains element name. Value contains text of first data node. + node_data, //!< A data node. Name is empty. Value contains data text. + node_cdata, //!< A CDATA node. Name is empty. Value contains data text. + node_comment, //!< A comment node. Name is empty. Value contains comment text. + node_declaration, //!< A declaration node. Name and value are empty. Declaration parameters (version, encoding and standalone) are in node attributes. + node_doctype, //!< A DOCTYPE node. Name is empty. Value contains DOCTYPE text. + node_pi //!< A PI node. Name contains target. Value contains instructions. + }; + + /////////////////////////////////////////////////////////////////////// + // Parsing flags + + //! Parse flag instructing the parser to not create data nodes. + //! Text of first data node will still be placed in value of parent element, unless rapidxml::parse_no_element_values flag is also specified. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_no_data_nodes = 0x1; + + //! Parse flag instructing the parser to not use text of first data node as a value of parent element. + //! Can be combined with other flags by use of | operator. + //! Note that child data nodes of element node take precendence over its value when printing. + //! That is, if element has one or more child data nodes and a value, the value will be ignored. + //! Use rapidxml::parse_no_data_nodes flag to prevent creation of data nodes if you want to manipulate data using values of elements. + //!

+ //! See xml_document::parse() function. + const int parse_no_element_values = 0x2; + + //! Parse flag instructing the parser to not place zero terminators after strings in the source text. + //! By default zero terminators are placed, modifying source text. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_no_string_terminators = 0x4; + + //! Parse flag instructing the parser to not translate entities in the source text. + //! By default entities are translated, modifying source text. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_no_entity_translation = 0x8; + + //! Parse flag instructing the parser to disable UTF-8 handling and assume plain 8 bit characters. + //! By default, UTF-8 handling is enabled. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_no_utf8 = 0x10; + + //! Parse flag instructing the parser to create XML declaration node. + //! By default, declaration node is not created. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_declaration_node = 0x20; + + //! Parse flag instructing the parser to create comments nodes. + //! By default, comment nodes are not created. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_comment_nodes = 0x40; + + //! Parse flag instructing the parser to create DOCTYPE node. + //! By default, doctype node is not created. + //! Although W3C specification allows at most one DOCTYPE node, RapidXml will silently accept documents with more than one. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_doctype_node = 0x80; + + //! Parse flag instructing the parser to create PI nodes. + //! By default, PI nodes are not created. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_pi_nodes = 0x100; + + //! Parse flag instructing the parser to validate closing tag names. + //! If not set, name inside closing tag is irrelevant to the parser. + //! By default, closing tags are not validated. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_validate_closing_tags = 0x200; + + //! Parse flag instructing the parser to trim all leading and trailing whitespace of data nodes. + //! By default, whitespace is not trimmed. + //! This flag does not cause the parser to modify source text. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_trim_whitespace = 0x400; + + //! Parse flag instructing the parser to condense all whitespace runs of data nodes to a single space character. + //! Trimming of leading and trailing whitespace of data is controlled by rapidxml::parse_trim_whitespace flag. + //! By default, whitespace is not normalized. + //! If this flag is specified, source text will be modified. + //! Can be combined with other flags by use of | operator. + //!

+ //! See xml_document::parse() function. + const int parse_normalize_whitespace = 0x800; + + // Compound flags + + //! Parse flags which represent default behaviour of the parser. + //! This is always equal to 0, so that all other flags can be simply ored together. + //! Normally there is no need to inconveniently disable flags by anding with their negated (~) values. + //! This also means that meaning of each flag is a negation of the default setting. + //! For example, if flag name is rapidxml::parse_no_utf8, it means that utf-8 is enabled by default, + //! and using the flag will disable it. + //!

+ //! See xml_document::parse() function. + const int parse_default = 0; + + //! A combination of parse flags that forbids any modifications of the source text. + //! This also results in faster parsing. However, note that the following will occur: + //!
    + //!
  • names and values of nodes will not be zero terminated, you have to use xml_base::name_size() and xml_base::value_size() functions to determine where name and value ends
  • + //!
  • entities will not be translated
  • + //!
  • whitespace will not be normalized
  • + //!
+ //! See xml_document::parse() function. + const int parse_non_destructive = parse_no_string_terminators | parse_no_entity_translation; + + //! A combination of parse flags resulting in fastest possible parsing, without sacrificing important data. + //!

+ //! See xml_document::parse() function. + const int parse_fastest = parse_non_destructive | parse_no_data_nodes; + + //! A combination of parse flags resulting in largest amount of data being extracted. + //! This usually results in slowest parsing. + //!

+ //! See xml_document::parse() function. + const int parse_full = parse_declaration_node | parse_comment_nodes | parse_doctype_node | parse_pi_nodes | parse_validate_closing_tags; + + /////////////////////////////////////////////////////////////////////// + // Internals + + //! \cond internal + namespace internal + { + + // Struct that contains lookup tables for the parser + // It must be a template to allow correct linking (because it has static data members, which are defined in a header file). + template + struct lookup_tables + { + static const unsigned char lookup_whitespace[256]; // Whitespace table + static const unsigned char lookup_node_name[256]; // Node name table + static const unsigned char lookup_text[256]; // Text table + static const unsigned char lookup_text_pure_no_ws[256]; // Text table + static const unsigned char lookup_text_pure_with_ws[256]; // Text table + static const unsigned char lookup_attribute_name[256]; // Attribute name table + static const unsigned char lookup_attribute_data_1[256]; // Attribute data table with single quote + static const unsigned char lookup_attribute_data_1_pure[256]; // Attribute data table with single quote + static const unsigned char lookup_attribute_data_2[256]; // Attribute data table with double quotes + static const unsigned char lookup_attribute_data_2_pure[256]; // Attribute data table with double quotes + static const unsigned char lookup_digits[256]; // Digits + static const unsigned char lookup_upcase[256]; // To uppercase conversion table for ASCII characters + }; + + // Find length of the string + template + inline std::size_t measure(const Ch *p) + { + const Ch *tmp = p; + while (*tmp) + ++tmp; + return tmp - p; + } + + // Compare strings for equality + template + inline bool compare(const Ch *p1, std::size_t size1, const Ch *p2, std::size_t size2, bool case_sensitive) + { + if (size1 != size2) + return false; + if (case_sensitive) + { + for (const Ch *end = p1 + size1; p1 < end; ++p1, ++p2) + if (*p1 != *p2) + return false; + } + else + { + for (const Ch *end = p1 + size1; p1 < end; ++p1, ++p2) + if (lookup_tables<0>::lookup_upcase[static_cast(*p1)] != lookup_tables<0>::lookup_upcase[static_cast(*p2)]) + return false; + } + return true; + } + } + //! \endcond + + /////////////////////////////////////////////////////////////////////// + // Memory pool + + //! This class is used by the parser to create new nodes and attributes, without overheads of dynamic memory allocation. + //! In most cases, you will not need to use this class directly. + //! However, if you need to create nodes manually or modify names/values of nodes, + //! you are encouraged to use memory_pool of relevant xml_document to allocate the memory. + //! Not only is this faster than allocating them by using new operator, + //! but also their lifetime will be tied to the lifetime of document, + //! possibly simplyfing memory management. + //!

+ //! Call allocate_node() or allocate_attribute() functions to obtain new nodes or attributes from the pool. + //! You can also call allocate_string() function to allocate strings. + //! Such strings can then be used as names or values of nodes without worrying about their lifetime. + //! Note that there is no free() function -- all allocations are freed at once when clear() function is called, + //! or when the pool is destroyed. + //!

+ //! It is also possible to create a standalone memory_pool, and use it + //! to allocate nodes, whose lifetime will not be tied to any document. + //!

+ //! Pool maintains RAPIDXML_STATIC_POOL_SIZE bytes of statically allocated memory. + //! Until static memory is exhausted, no dynamic memory allocations are done. + //! When static memory is exhausted, pool allocates additional blocks of memory of size RAPIDXML_DYNAMIC_POOL_SIZE each, + //! by using global new[] and delete[] operators. + //! This behaviour can be changed by setting custom allocation routines. + //! Use set_allocator() function to set them. + //!

+ //! Allocations for nodes, attributes and strings are aligned at RAPIDXML_ALIGNMENT bytes. + //! This value defaults to the size of pointer on target architecture. + //!

+ //! To obtain absolutely top performance from the parser, + //! it is important that all nodes are allocated from a single, contiguous block of memory. + //! Otherwise, cache misses when jumping between two (or more) disjoint blocks of memory can slow down parsing quite considerably. + //! If required, you can tweak RAPIDXML_STATIC_POOL_SIZE, RAPIDXML_DYNAMIC_POOL_SIZE and RAPIDXML_ALIGNMENT + //! to obtain best wasted memory to performance compromise. + //! To do it, define their values before rapidxml.hpp file is included. + //! \param Ch Character type of created nodes. + template + class memory_pool + { + + public: + + //! \cond internal + typedef void *(alloc_func)(std::size_t); // Type of user-defined function used to allocate memory + typedef void (free_func)(void *); // Type of user-defined function used to free memory + //! \endcond + + //! Constructs empty pool with default allocator functions. + memory_pool() + : m_alloc_func(0) + , m_free_func(0) + { + init(); + } + + //! Destroys pool and frees all the memory. + //! This causes memory occupied by nodes allocated by the pool to be freed. + //! Nodes allocated from the pool are no longer valid. + ~memory_pool() + { + clear(); + } + + //! Allocates a new node from the pool, and optionally assigns name and value to it. + //! If the allocation request cannot be accomodated, this function will throw std::bad_alloc. + //! If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function + //! will call rapidxml::parse_error_handler() function. + //! \param type Type of node to create. + //! \param name Name to assign to the node, or 0 to assign no name. + //! \param value Value to assign to the node, or 0 to assign no value. + //! \param name_size Size of name to assign, or 0 to automatically calculate size from name string. + //! \param value_size Size of value to assign, or 0 to automatically calculate size from value string. + //! \return Pointer to allocated node. This pointer will never be NULL. + xml_node *allocate_node(node_type type, + const Ch *name = 0, const Ch *value = 0, + std::size_t name_size = 0, std::size_t value_size = 0) + { + void *memory = allocate_aligned(sizeof(xml_node)); + xml_node *node = new(memory) xml_node(type); + if (name) + { + if (name_size > 0) + node->name(name, name_size); + else + node->name(name); + } + if (value) + { + if (value_size > 0) + node->value(value, value_size); + else + node->value(value); + } + return node; + } + + //! Allocates a new attribute from the pool, and optionally assigns name and value to it. + //! If the allocation request cannot be accomodated, this function will throw std::bad_alloc. + //! If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function + //! will call rapidxml::parse_error_handler() function. + //! \param name Name to assign to the attribute, or 0 to assign no name. + //! \param value Value to assign to the attribute, or 0 to assign no value. + //! \param name_size Size of name to assign, or 0 to automatically calculate size from name string. + //! \param value_size Size of value to assign, or 0 to automatically calculate size from value string. + //! \return Pointer to allocated attribute. This pointer will never be NULL. 
+ xml_attribute *allocate_attribute(const Ch *name = 0, const Ch *value = 0, + std::size_t name_size = 0, std::size_t value_size = 0) + { + void *memory = allocate_aligned(sizeof(xml_attribute)); + xml_attribute *attribute = new(memory) xml_attribute; + if (name) + { + if (name_size > 0) + attribute->name(name, name_size); + else + attribute->name(name); + } + if (value) + { + if (value_size > 0) + attribute->value(value, value_size); + else + attribute->value(value); + } + return attribute; + } + + //! Allocates a char array of given size from the pool, and optionally copies a given string to it. + //! If the allocation request cannot be accomodated, this function will throw std::bad_alloc. + //! If exceptions are disabled by defining RAPIDXML_NO_EXCEPTIONS, this function + //! will call rapidxml::parse_error_handler() function. + //! \param source String to initialize the allocated memory with, or 0 to not initialize it. + //! \param size Number of characters to allocate, or zero to calculate it automatically from source string length; if size is 0, source string must be specified and null terminated. + //! \return Pointer to allocated char array. This pointer will never be NULL. + Ch *allocate_string(const Ch *source = 0, std::size_t size = 0) + { + assert(source || size); // Either source or size (or both) must be specified + if (size == 0) + size = internal::measure(source) + 1; + Ch *result = static_cast(allocate_aligned(size * sizeof(Ch))); + if (source) + for (std::size_t i = 0; i < size; ++i) + result[i] = source[i]; + return result; + } + + //! Clones an xml_node and its hierarchy of child nodes and attributes. + //! Nodes and attributes are allocated from this memory pool. + //! Names and values are not cloned, they are shared between the clone and the source. + //! Result node can be optionally specified as a second parameter, + //! in which case its contents will be replaced with cloned source node. + //! This is useful when you want to clone entire document. + //! \param source Node to clone. + //! \param result Node to put results in, or 0 to automatically allocate result node + //! \return Pointer to cloned node. This pointer will never be NULL. + xml_node *clone_node(const xml_node *source, xml_node *result = 0) + { + // Prepare result node + if (result) + { + result->remove_all_attributes(); + result->remove_all_nodes(); + result->type(source->type()); + } + else + result = allocate_node(source->type()); + + // Clone name and value + result->name(source->name(), source->name_size()); + result->value(source->value(), source->value_size()); + + // Clone child nodes and attributes + for (xml_node *child = source->first_node(); child; child = child->next_sibling()) + result->append_node(clone_node(child)); + for (xml_attribute *attr = source->first_attribute(); attr; attr = attr->next_attribute()) + result->append_attribute(allocate_attribute(attr->name(), attr->value(), attr->name_size(), attr->value_size())); + + return result; + } + + //! Clears the pool. + //! This causes memory occupied by nodes allocated by the pool to be freed. + //! Any nodes or strings allocated from the pool will no longer be valid. + void clear() + { + while (m_begin != m_static_memory) + { + char *previous_begin = reinterpret_cast
(align(m_begin))->previous_begin; + if (m_free_func) + m_free_func(m_begin); + else + delete[] m_begin; + m_begin = previous_begin; + } + init(); + } + + //! Sets or resets the user-defined memory allocation functions for the pool. + //! This can only be called when no memory is allocated from the pool yet, otherwise results are undefined. + //! Allocation function must not return invalid pointer on failure. It should either throw, + //! stop the program, or use longjmp() function to pass control to other place of program. + //! If it returns invalid pointer, results are undefined. + //!

+ //! User defined allocation functions must have the following forms: + //!
+ //!
void *allocate(std::size_t size); + //!
void free(void *pointer); + //!

+ //! \param af Allocation function, or 0 to restore default function + //! \param ff Free function, or 0 to restore default function + void set_allocator(alloc_func *af, free_func *ff) + { + assert(m_begin == m_static_memory && m_ptr == align(m_begin)); // Verify that no memory is allocated yet + m_alloc_func = af; + m_free_func = ff; + } + + private: + + struct header + { + char *previous_begin; + }; + + void init() + { + m_begin = m_static_memory; + m_ptr = align(m_begin); + m_end = m_static_memory + sizeof(m_static_memory); + } + + char *align(char *ptr) + { + std::size_t alignment = ((RAPIDXML_ALIGNMENT - (std::size_t(ptr) & (RAPIDXML_ALIGNMENT - 1))) & (RAPIDXML_ALIGNMENT - 1)); + return ptr + alignment; + } + + char *allocate_raw(std::size_t size) + { + // Allocate + void *memory; + if (m_alloc_func) // Allocate memory using either user-specified allocation function or global operator new[] + { + memory = m_alloc_func(size); + assert(memory); // Allocator is not allowed to return 0, on failure it must either throw, stop the program or use longjmp + } + else + { + memory = new char[size]; +#ifdef RAPIDXML_NO_EXCEPTIONS + if (!memory) // If exceptions are disabled, verify memory allocation, because new will not be able to throw bad_alloc + RAPIDXML_PARSE_ERROR("out of memory", 0); +#endif + } + return static_cast(memory); + } + + void *allocate_aligned(std::size_t size) + { + // Calculate aligned pointer + char *result = align(m_ptr); + + // If not enough memory left in current pool, allocate a new pool + if (result + size > m_end) + { + // Calculate required pool size (may be bigger than RAPIDXML_DYNAMIC_POOL_SIZE) + std::size_t pool_size = RAPIDXML_DYNAMIC_POOL_SIZE; + if (pool_size < size) + pool_size = size; + + // Allocate + std::size_t alloc_size = sizeof(header) + (2 * RAPIDXML_ALIGNMENT - 2) + pool_size; // 2 alignments required in worst case: one for header, one for actual allocation + char *raw_memory = allocate_raw(alloc_size); + + // Setup new pool in allocated memory + char *pool = align(raw_memory); + header *new_header = reinterpret_cast
(pool); + new_header->previous_begin = m_begin; + m_begin = raw_memory; + m_ptr = pool + sizeof(header); + m_end = raw_memory + alloc_size; + + // Calculate aligned pointer again using new pool + result = align(m_ptr); + } + + // Update pool and return aligned pointer + m_ptr = result + size; + return result; + } + + char *m_begin; // Start of raw memory making up current pool + char *m_ptr; // First free byte in current pool + char *m_end; // One past last available byte in current pool + char m_static_memory[RAPIDXML_STATIC_POOL_SIZE]; // Static raw memory + alloc_func *m_alloc_func; // Allocator function, or 0 if default is to be used + free_func *m_free_func; // Free function, or 0 if default is to be used + }; + + /////////////////////////////////////////////////////////////////////////// + // XML base + + //! Base class for xml_node and xml_attribute implementing common functions: + //! name(), name_size(), value(), value_size() and parent(). + //! \param Ch Character type to use + template + class xml_base + { + + public: + + /////////////////////////////////////////////////////////////////////////// + // Construction & destruction + + // Construct a base with empty name, value and parent + xml_base() + : m_name(0) + , m_value(0) + , m_parent(0) + { + } + + /////////////////////////////////////////////////////////////////////////// + // Node data access + + //! Gets name of the node. + //! Interpretation of name depends on type of node. + //! Note that name will not be zero-terminated if rapidxml::parse_no_string_terminators option was selected during parse. + //!

+ //! Use name_size() function to determine length of the name. + //! \return Name of node, or empty string if node has no name. + Ch *name() const + { + return m_name ? m_name : nullstr(); + } + + //! Gets size of node name, not including terminator character. + //! This function works correctly irrespective of whether name is or is not zero terminated. + //! \return Size of node name, in characters. + std::size_t name_size() const + { + return m_name ? m_name_size : 0; + } + + //! Gets value of node. + //! Interpretation of value depends on type of node. + //! Note that value will not be zero-terminated if rapidxml::parse_no_string_terminators option was selected during parse. + //!

+ //! Use value_size() function to determine length of the value. + //! \return Value of node, or empty string if node has no value. + Ch *value() const + { + return m_value ? m_value : nullstr(); + } + + //! Gets size of node value, not including terminator character. + //! This function works correctly irrespective of whether value is or is not zero terminated. + //! \return Size of node value, in characters. + std::size_t value_size() const + { + return m_value ? m_value_size : 0; + } + + /////////////////////////////////////////////////////////////////////////// + // Node modification + + //! Sets name of node to a non zero-terminated string. + //! See \ref ownership_of_strings. + //!

+ //! Note that node does not own its name or value, it only stores a pointer to it. + //! It will not delete or otherwise free the pointer on destruction. + //! It is reponsibility of the user to properly manage lifetime of the string. + //! The easiest way to achieve it is to use memory_pool of the document to allocate the string - + //! on destruction of the document the string will be automatically freed. + //!

+ //! Size of name must be specified separately, because name does not have to be zero terminated. + //! Use name(const Ch *) function to have the length automatically calculated (string must be zero terminated). + //! \param name Name of node to set. Does not have to be zero terminated. + //! \param size Size of name, in characters. This does not include zero terminator, if one is present. + void name(const Ch *name, std::size_t size) + { + m_name = const_cast(name); + m_name_size = size; + } + + //! Sets name of node to a zero-terminated string. + //! See also \ref ownership_of_strings and xml_node::name(const Ch *, std::size_t). + //! \param name Name of node to set. Must be zero terminated. + void name(const Ch *name) + { + this->name(name, internal::measure(name)); + } + + //! Sets value of node to a non zero-terminated string. + //! See \ref ownership_of_strings. + //!

+ //! Note that node does not own its name or value, it only stores a pointer to it. + //! It will not delete or otherwise free the pointer on destruction. + //! It is reponsibility of the user to properly manage lifetime of the string. + //! The easiest way to achieve it is to use memory_pool of the document to allocate the string - + //! on destruction of the document the string will be automatically freed. + //!

+ //! Size of value must be specified separately, because it does not have to be zero terminated. + //! Use value(const Ch *) function to have the length automatically calculated (string must be zero terminated). + //!

+ //! If an element has a child node of type node_data, it will take precedence over element value when printing. + //! If you want to manipulate data of elements using values, use parser flag rapidxml::parse_no_data_nodes to prevent creation of data nodes by the parser. + //! \param value value of node to set. Does not have to be zero terminated. + //! \param size Size of value, in characters. This does not include zero terminator, if one is present. + void value(const Ch *value, std::size_t size) + { + m_value = const_cast(value); + m_value_size = size; + } + + //! Sets value of node to a zero-terminated string. + //! See also \ref ownership_of_strings and xml_node::value(const Ch *, std::size_t). + //! \param value Vame of node to set. Must be zero terminated. + void value(const Ch *value) + { + this->value(value, internal::measure(value)); + } + + /////////////////////////////////////////////////////////////////////////// + // Related nodes access + + //! Gets node parent. + //! \return Pointer to parent node, or 0 if there is no parent. + xml_node *parent() const + { + return m_parent; + } + + protected: + + // Return empty string + static Ch *nullstr() + { + static Ch zero = Ch('\0'); + return &zero; + } + + Ch *m_name; // Name of node, or 0 if no name + Ch *m_value; // Value of node, or 0 if no value + std::size_t m_name_size; // Length of node name, or undefined of no name + std::size_t m_value_size; // Length of node value, or undefined if no value + xml_node *m_parent; // Pointer to parent node, or 0 if none + + }; + + //! Class representing attribute node of XML document. + //! Each attribute has name and value strings, which are available through name() and value() functions (inherited from xml_base). + //! Note that after parse, both name and value of attribute will point to interior of source text used for parsing. + //! Thus, this text must persist in memory for the lifetime of attribute. + //! \param Ch Character type to use. + template + class xml_attribute: public xml_base + { + + friend class xml_node; + + public: + + /////////////////////////////////////////////////////////////////////////// + // Construction & destruction + + //! Constructs an empty attribute with the specified type. + //! Consider using memory_pool of appropriate xml_document if allocating attributes manually. + xml_attribute() + { + } + + /////////////////////////////////////////////////////////////////////////// + // Related nodes access + + //! Gets document of which attribute is a child. + //! \return Pointer to document that contains this attribute, or 0 if there is no parent document. + xml_document *document() const + { + if (xml_node *node = this->parent()) + { + while (node->parent()) + node = node->parent(); + return node->type() == node_document ? static_cast *>(node) : 0; + } + else + return 0; + } + + //! Gets previous attribute, optionally matching attribute name. + //! \param name Name of attribute to find, or 0 to return previous attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found attribute, or 0 if not found. 
+ xml_attribute *previous_attribute(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_attribute *attribute = m_prev_attribute; attribute; attribute = attribute->m_prev_attribute) + if (internal::compare(attribute->name(), attribute->name_size(), name, name_size, case_sensitive)) + return attribute; + return 0; + } + else + return this->m_parent ? m_prev_attribute : 0; + } + + //! Gets next attribute, optionally matching attribute name. + //! \param name Name of attribute to find, or 0 to return next attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found attribute, or 0 if not found. + xml_attribute *next_attribute(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_attribute *attribute = m_next_attribute; attribute; attribute = attribute->m_next_attribute) + if (internal::compare(attribute->name(), attribute->name_size(), name, name_size, case_sensitive)) + return attribute; + return 0; + } + else + return this->m_parent ? m_next_attribute : 0; + } + + private: + + xml_attribute *m_prev_attribute; // Pointer to previous sibling of attribute, or 0 if none; only valid if parent is non-zero + xml_attribute *m_next_attribute; // Pointer to next sibling of attribute, or 0 if none; only valid if parent is non-zero + + }; + + /////////////////////////////////////////////////////////////////////////// + // XML node + + //! Class representing a node of XML document. + //! Each node may have associated name and value strings, which are available through name() and value() functions. + //! Interpretation of name and value depends on type of the node. + //! Type of node can be determined by using type() function. + //!

+ //! Note that after parse, both name and value of node, if any, will point interior of source text used for parsing. + //! Thus, this text must persist in the memory for the lifetime of node. + //! \param Ch Character type to use. + template + class xml_node: public xml_base + { + + public: + + /////////////////////////////////////////////////////////////////////////// + // Construction & destruction + + //! Constructs an empty node with the specified type. + //! Consider using memory_pool of appropriate document to allocate nodes manually. + //! \param type Type of node to construct. + xml_node(node_type type) + : m_type(type) + , m_first_node(0) + , m_first_attribute(0) + { + } + + /////////////////////////////////////////////////////////////////////////// + // Node data access + + //! Gets type of node. + //! \return Type of node. + node_type type() const + { + return m_type; + } + + /////////////////////////////////////////////////////////////////////////// + // Related nodes access + + //! Gets document of which node is a child. + //! \return Pointer to document that contains this node, or 0 if there is no parent document. + xml_document *document() const + { + xml_node *node = const_cast *>(this); + while (node->parent()) + node = node->parent(); + return node->type() == node_document ? static_cast *>(node) : 0; + } + + //! Gets first child node, optionally matching node name. + //! \param name Name of child to find, or 0 to return first child regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found child, or 0 if not found. + xml_node *first_node(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_node *child = m_first_node; child; child = child->next_sibling()) + if (internal::compare(child->name(), child->name_size(), name, name_size, case_sensitive)) + return child; + return 0; + } + else + return m_first_node; + } + + //! Gets last child node, optionally matching node name. + //! Behaviour is undefined if node has no children. + //! Use first_node() to test if node has children. + //! \param name Name of child to find, or 0 to return last child regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found child, or 0 if not found. + xml_node *last_node(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + assert(m_first_node); // Cannot query for last child if node has no children + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_node *child = m_last_node; child; child = child->previous_sibling()) + if (internal::compare(child->name(), child->name_size(), name, name_size, case_sensitive)) + return child; + return 0; + } + else + return m_last_node; + } + + //! Gets previous sibling node, optionally matching node name. + //! Behaviour is undefined if node has no parent. 
+ //! Use parent() to test if node has a parent. + //! \param name Name of sibling to find, or 0 to return previous sibling regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found sibling, or 0 if not found. + xml_node *previous_sibling(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + assert(this->m_parent); // Cannot query for siblings if node has no parent + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_node *sibling = m_prev_sibling; sibling; sibling = sibling->m_prev_sibling) + if (internal::compare(sibling->name(), sibling->name_size(), name, name_size, case_sensitive)) + return sibling; + return 0; + } + else + return m_prev_sibling; + } + + //! Gets next sibling node, optionally matching node name. + //! Behaviour is undefined if node has no parent. + //! Use parent() to test if node has a parent. + //! \param name Name of sibling to find, or 0 to return next sibling regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found sibling, or 0 if not found. + xml_node *next_sibling(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + assert(this->m_parent); // Cannot query for siblings if node has no parent + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_node *sibling = m_next_sibling; sibling; sibling = sibling->m_next_sibling) + if (internal::compare(sibling->name(), sibling->name_size(), name, name_size, case_sensitive)) + return sibling; + return 0; + } + else + return m_next_sibling; + } + + //! Gets first attribute of node, optionally matching attribute name. + //! \param name Name of attribute to find, or 0 to return first attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! \param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found attribute, or 0 if not found. + xml_attribute *first_attribute(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_attribute *attribute = m_first_attribute; attribute; attribute = attribute->m_next_attribute) + if (internal::compare(attribute->name(), attribute->name_size(), name, name_size, case_sensitive)) + return attribute; + return 0; + } + else + return m_first_attribute; + } + + //! Gets last attribute of node, optionally matching attribute name. + //! \param name Name of attribute to find, or 0 to return last attribute regardless of its name; this string doesn't have to be zero-terminated if name_size is non-zero + //! 
\param name_size Size of name, in characters, or 0 to have size calculated automatically from string + //! \param case_sensitive Should name comparison be case-sensitive; non case-sensitive comparison works properly only for ASCII characters + //! \return Pointer to found attribute, or 0 if not found. + xml_attribute *last_attribute(const Ch *name = 0, std::size_t name_size = 0, bool case_sensitive = true) const + { + if (name) + { + if (name_size == 0) + name_size = internal::measure(name); + for (xml_attribute *attribute = m_last_attribute; attribute; attribute = attribute->m_prev_attribute) + if (internal::compare(attribute->name(), attribute->name_size(), name, name_size, case_sensitive)) + return attribute; + return 0; + } + else + return m_first_attribute ? m_last_attribute : 0; + } + + /////////////////////////////////////////////////////////////////////////// + // Node modification + + //! Sets type of node. + //! \param type Type of node to set. + void type(node_type type) + { + m_type = type; + } + + /////////////////////////////////////////////////////////////////////////// + // Node manipulation + + //! Prepends a new child node. + //! The prepended child becomes the first child, and all existing children are moved one position back. + //! \param child Node to prepend. + void prepend_node(xml_node *child) + { + assert(child && !child->parent() && child->type() != node_document); + if (first_node()) + { + child->m_next_sibling = m_first_node; + m_first_node->m_prev_sibling = child; + } + else + { + child->m_next_sibling = 0; + m_last_node = child; + } + m_first_node = child; + child->m_parent = this; + child->m_prev_sibling = 0; + } + + //! Appends a new child node. + //! The appended child becomes the last child. + //! \param child Node to append. + void append_node(xml_node *child) + { + assert(child && !child->parent() && child->type() != node_document); + if (first_node()) + { + child->m_prev_sibling = m_last_node; + m_last_node->m_next_sibling = child; + } + else + { + child->m_prev_sibling = 0; + m_first_node = child; + } + m_last_node = child; + child->m_parent = this; + child->m_next_sibling = 0; + } + + //! Inserts a new child node at specified place inside the node. + //! All children after and including the specified node are moved one position back. + //! \param where Place where to insert the child, or 0 to insert at the back. + //! \param child Node to insert. + void insert_node(xml_node *where, xml_node *child) + { + assert(!where || where->parent() == this); + assert(child && !child->parent() && child->type() != node_document); + if (where == m_first_node) + prepend_node(child); + else if (where == 0) + append_node(child); + else + { + child->m_prev_sibling = where->m_prev_sibling; + child->m_next_sibling = where; + where->m_prev_sibling->m_next_sibling = child; + where->m_prev_sibling = child; + child->m_parent = this; + } + } + + //! Removes first child node. + //! If node has no children, behaviour is undefined. + //! Use first_node() to test if node has children. + void remove_first_node() + { + assert(first_node()); + xml_node *child = m_first_node; + m_first_node = child->m_next_sibling; + if (child->m_next_sibling) + child->m_next_sibling->m_prev_sibling = 0; + else + m_last_node = 0; + child->m_parent = 0; + } + + //! Removes last child of the node. + //! If node has no children, behaviour is undefined. + //! Use first_node() to test if node has children. 
+ void remove_last_node() + { + assert(first_node()); + xml_node *child = m_last_node; + if (child->m_prev_sibling) + { + m_last_node = child->m_prev_sibling; + child->m_prev_sibling->m_next_sibling = 0; + } + else + m_first_node = 0; + child->m_parent = 0; + } + + //! Removes specified child from the node + // \param where Pointer to child to be removed. + void remove_node(xml_node *where) + { + assert(where && where->parent() == this); + assert(first_node()); + if (where == m_first_node) + remove_first_node(); + else if (where == m_last_node) + remove_last_node(); + else + { + where->m_prev_sibling->m_next_sibling = where->m_next_sibling; + where->m_next_sibling->m_prev_sibling = where->m_prev_sibling; + where->m_parent = 0; + } + } + + //! Removes all child nodes (but not attributes). + void remove_all_nodes() + { + for (xml_node *node = first_node(); node; node = node->m_next_sibling) + node->m_parent = 0; + m_first_node = 0; + } + + //! Prepends a new attribute to the node. + //! \param attribute Attribute to prepend. + void prepend_attribute(xml_attribute *attribute) + { + assert(attribute && !attribute->parent()); + if (first_attribute()) + { + attribute->m_next_attribute = m_first_attribute; + m_first_attribute->m_prev_attribute = attribute; + } + else + { + attribute->m_next_attribute = 0; + m_last_attribute = attribute; + } + m_first_attribute = attribute; + attribute->m_parent = this; + attribute->m_prev_attribute = 0; + } + + //! Appends a new attribute to the node. + //! \param attribute Attribute to append. + void append_attribute(xml_attribute *attribute) + { + assert(attribute && !attribute->parent()); + if (first_attribute()) + { + attribute->m_prev_attribute = m_last_attribute; + m_last_attribute->m_next_attribute = attribute; + } + else + { + attribute->m_prev_attribute = 0; + m_first_attribute = attribute; + } + m_last_attribute = attribute; + attribute->m_parent = this; + attribute->m_next_attribute = 0; + } + + //! Inserts a new attribute at specified place inside the node. + //! All attributes after and including the specified attribute are moved one position back. + //! \param where Place where to insert the attribute, or 0 to insert at the back. + //! \param attribute Attribute to insert. + void insert_attribute(xml_attribute *where, xml_attribute *attribute) + { + assert(!where || where->parent() == this); + assert(attribute && !attribute->parent()); + if (where == m_first_attribute) + prepend_attribute(attribute); + else if (where == 0) + append_attribute(attribute); + else + { + attribute->m_prev_attribute = where->m_prev_attribute; + attribute->m_next_attribute = where; + where->m_prev_attribute->m_next_attribute = attribute; + where->m_prev_attribute = attribute; + attribute->m_parent = this; + } + } + + //! Removes first attribute of the node. + //! If node has no attributes, behaviour is undefined. + //! Use first_attribute() to test if node has attributes. + void remove_first_attribute() + { + assert(first_attribute()); + xml_attribute *attribute = m_first_attribute; + if (attribute->m_next_attribute) + { + attribute->m_next_attribute->m_prev_attribute = 0; + } + else + m_last_attribute = 0; + attribute->m_parent = 0; + m_first_attribute = attribute->m_next_attribute; + } + + //! Removes last attribute of the node. + //! If node has no attributes, behaviour is undefined. + //! Use first_attribute() to test if node has attributes. 
+ void remove_last_attribute() + { + assert(first_attribute()); + xml_attribute *attribute = m_last_attribute; + if (attribute->m_prev_attribute) + { + attribute->m_prev_attribute->m_next_attribute = 0; + m_last_attribute = attribute->m_prev_attribute; + } + else + m_first_attribute = 0; + attribute->m_parent = 0; + } + + //! Removes specified attribute from node. + //! \param where Pointer to attribute to be removed. + void remove_attribute(xml_attribute *where) + { + assert(first_attribute() && where->parent() == this); + if (where == m_first_attribute) + remove_first_attribute(); + else if (where == m_last_attribute) + remove_last_attribute(); + else + { + where->m_prev_attribute->m_next_attribute = where->m_next_attribute; + where->m_next_attribute->m_prev_attribute = where->m_prev_attribute; + where->m_parent = 0; + } + } + + //! Removes all attributes of node. + void remove_all_attributes() + { + for (xml_attribute *attribute = first_attribute(); attribute; attribute = attribute->m_next_attribute) + attribute->m_parent = 0; + m_first_attribute = 0; + } + + private: + + /////////////////////////////////////////////////////////////////////////// + // Restrictions + + // No copying + xml_node(const xml_node &); + void operator =(const xml_node &); + + /////////////////////////////////////////////////////////////////////////// + // Data members + + // Note that some of the pointers below have UNDEFINED values if certain other pointers are 0. + // This is required for maximum performance, as it allows the parser to omit initialization of + // unneded/redundant values. + // + // The rules are as follows: + // 1. first_node and first_attribute contain valid pointers, or 0 if node has no children/attributes respectively + // 2. last_node and last_attribute are valid only if node has at least one child/attribute respectively, otherwise they contain garbage + // 3. prev_sibling and next_sibling are valid only if node has a parent, otherwise they contain garbage + + node_type m_type; // Type of node; always valid + xml_node *m_first_node; // Pointer to first child node, or 0 if none; always valid + xml_node *m_last_node; // Pointer to last child node, or 0 if none; this value is only valid if m_first_node is non-zero + xml_attribute *m_first_attribute; // Pointer to first attribute of node, or 0 if none; always valid + xml_attribute *m_last_attribute; // Pointer to last attribute of node, or 0 if none; this value is only valid if m_first_attribute is non-zero + xml_node *m_prev_sibling; // Pointer to previous sibling of node, or 0 if none; this value is only valid if m_parent is non-zero + xml_node *m_next_sibling; // Pointer to next sibling of node, or 0 if none; this value is only valid if m_parent is non-zero + + }; + + /////////////////////////////////////////////////////////////////////////// + // XML document + + //! This class represents root of the DOM hierarchy. + //! It is also an xml_node and a memory_pool through public inheritance. + //! Use parse() function to build a DOM tree from a zero-terminated XML text string. + //! parse() function allocates memory for nodes and attributes by using functions of xml_document, + //! which are inherited from memory_pool. + //! To access root node of the document, use the document itself, as if it was an xml_node. + //! \param Ch Character type to use. + template + class xml_document: public xml_node, public memory_pool + { + + public: + + //! Constructs empty XML document + xml_document() + : xml_node(node_document) + { + } + + //! 
Parses zero-terminated XML string according to given flags. + //! Passed string will be modified by the parser, unless rapidxml::parse_non_destructive flag is used. + //! The string must persist for the lifetime of the document. + //! In case of error, rapidxml::parse_error exception will be thrown. + //! + //! If you want to parse contents of a file, you must first load the file into memory and pass a pointer to its beginning. + //! Make sure that data is zero-terminated. + //!
+ //! Document can be parsed into multiple times. + //! Each new call to parse removes previous nodes and attributes (if any), but does not clear memory pool. + //! \param text XML data to parse; pointer is non-const to denote fact that this data may be modified by the parser. + template + void parse(Ch *text) + { + assert(text); + + // Remove current contents + this->remove_all_nodes(); + this->remove_all_attributes(); + + // Parse BOM, if any + parse_bom(text); + + // Parse children + while (1) + { + // Skip whitespace before node + skip(text); + if (*text == 0) + break; + + // Parse and append new child + if (*text == Ch('<')) + { + ++text; // Skip '<' + if (xml_node *node = parse_node(text)) + this->append_node(node); + } + else + RAPIDXML_PARSE_ERROR("expected <", text); + } + + } + + //! Clears the document by deleting all nodes and clearing the memory pool. + //! All nodes owned by document pool are destroyed. + void clear() + { + this->remove_all_nodes(); + this->remove_all_attributes(); + memory_pool::clear(); + } + + private: + + /////////////////////////////////////////////////////////////////////// + // Internal character utility functions + + // Detect whitespace character + struct whitespace_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_whitespace[static_cast(ch)]; + } + }; + + // Detect node name character + struct node_name_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_node_name[static_cast(ch)]; + } + }; + + // Detect attribute name character + struct attribute_name_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_attribute_name[static_cast(ch)]; + } + }; + + // Detect text character (PCDATA) + struct text_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_text[static_cast(ch)]; + } + }; + + // Detect text character (PCDATA) that does not require processing + struct text_pure_no_ws_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_text_pure_no_ws[static_cast(ch)]; + } + }; + + // Detect text character (PCDATA) that does not require processing + struct text_pure_with_ws_pred + { + static unsigned char test(Ch ch) + { + return internal::lookup_tables<0>::lookup_text_pure_with_ws[static_cast(ch)]; + } + }; + + // Detect attribute value character + template + struct attribute_value_pred + { + static unsigned char test(Ch ch) + { + if (Quote == Ch('\'')) + return internal::lookup_tables<0>::lookup_attribute_data_1[static_cast(ch)]; + if (Quote == Ch('\"')) + return internal::lookup_tables<0>::lookup_attribute_data_2[static_cast(ch)]; + return 0; // Should never be executed, to avoid warnings on Comeau + } + }; + + // Detect attribute value character + template + struct attribute_value_pure_pred + { + static unsigned char test(Ch ch) + { + if (Quote == Ch('\'')) + return internal::lookup_tables<0>::lookup_attribute_data_1_pure[static_cast(ch)]; + if (Quote == Ch('\"')) + return internal::lookup_tables<0>::lookup_attribute_data_2_pure[static_cast(ch)]; + return 0; // Should never be executed, to avoid warnings on Comeau + } + }; + + // Insert coded character, using UTF8 or 8-bit ASCII + template + static void insert_coded_character(Ch *&text, unsigned long code) + { + if (Flags & parse_no_utf8) + { + // Insert 8-bit ASCII character + // Todo: possibly verify that code is less than 256 and use replacement char otherwise? 
+ text[0] = static_cast(code); + text += 1; + } + else + { + // Insert UTF8 sequence + if (code < 0x80) // 1 byte sequence + { + text[0] = static_cast(code); + text += 1; + } + else if (code < 0x800) // 2 byte sequence + { + text[1] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[0] = static_cast(code | 0xC0); + text += 2; + } + else if (code < 0x10000) // 3 byte sequence + { + text[2] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[1] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[0] = static_cast(code | 0xE0); + text += 3; + } + else if (code < 0x110000) // 4 byte sequence + { + text[3] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[2] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[1] = static_cast((code | 0x80) & 0xBF); code >>= 6; + text[0] = static_cast(code | 0xF0); + text += 4; + } + else // Invalid, only codes up to 0x10FFFF are allowed in Unicode + { + RAPIDXML_PARSE_ERROR("invalid numeric character entity", text); + } + } + } + + // Skip characters until predicate evaluates to true + template + static void skip(Ch *&text) + { + Ch *tmp = text; + while (StopPred::test(*tmp)) + ++tmp; + text = tmp; + } + + // Skip characters until predicate evaluates to true while doing the following: + // - replacing XML character entity references with proper characters (' & " < > &#...;) + // - condensing whitespace sequences to single space character + template + static Ch *skip_and_expand_character_refs(Ch *&text) + { + // If entity translation, whitespace condense and whitespace trimming is disabled, use plain skip + if (Flags & parse_no_entity_translation && + !(Flags & parse_normalize_whitespace) && + !(Flags & parse_trim_whitespace)) + { + skip(text); + return text; + } + + // Use simple skip until first modification is detected + skip(text); + + // Use translation skip + Ch *src = text; + Ch *dest = src; + while (StopPred::test(*src)) + { + // If entity translation is enabled + if (!(Flags & parse_no_entity_translation)) + { + // Test if replacement is needed + if (src[0] == Ch('&')) + { + switch (src[1]) + { + + // & ' + case Ch('a'): + if (src[2] == Ch('m') && src[3] == Ch('p') && src[4] == Ch(';')) + { + *dest = Ch('&'); + ++dest; + src += 5; + continue; + } + if (src[2] == Ch('p') && src[3] == Ch('o') && src[4] == Ch('s') && src[5] == Ch(';')) + { + *dest = Ch('\''); + ++dest; + src += 6; + continue; + } + break; + + // " + case Ch('q'): + if (src[2] == Ch('u') && src[3] == Ch('o') && src[4] == Ch('t') && src[5] == Ch(';')) + { + *dest = Ch('"'); + ++dest; + src += 6; + continue; + } + break; + + // > + case Ch('g'): + if (src[2] == Ch('t') && src[3] == Ch(';')) + { + *dest = Ch('>'); + ++dest; + src += 4; + continue; + } + break; + + // < + case Ch('l'): + if (src[2] == Ch('t') && src[3] == Ch(';')) + { + *dest = Ch('<'); + ++dest; + src += 4; + continue; + } + break; + + // &#...; - assumes ASCII + case Ch('#'): + if (src[2] == Ch('x')) + { + unsigned long code = 0; + src += 3; // Skip &#x + while (1) + { + unsigned char digit = internal::lookup_tables<0>::lookup_digits[static_cast(*src)]; + if (digit == 0xFF) + break; + code = code * 16 + digit; + ++src; + } + insert_coded_character(dest, code); // Put character in output + } + else + { + unsigned long code = 0; + src += 2; // Skip &# + while (1) + { + unsigned char digit = internal::lookup_tables<0>::lookup_digits[static_cast(*src)]; + if (digit == 0xFF) + break; + code = code * 10 + digit; + ++src; + } + insert_coded_character(dest, code); // Put character in output + } + if (*src == 
Ch(';')) + ++src; + else + RAPIDXML_PARSE_ERROR("expected ;", src); + continue; + + // Something else + default: + // Ignore, just copy '&' verbatim + break; + + } + } + } + + // If whitespace condensing is enabled + if (Flags & parse_normalize_whitespace) + { + // Test if condensing is needed + if (whitespace_pred::test(*src)) + { + *dest = Ch(' '); ++dest; // Put single space in dest + ++src; // Skip first whitespace char + // Skip remaining whitespace chars + while (whitespace_pred::test(*src)) + ++src; + continue; + } + } + + // No replacement, only copy character + *dest++ = *src++; + + } + + // Return new end + text = src; + return dest; + + } + + /////////////////////////////////////////////////////////////////////// + // Internal parsing functions + + // Parse BOM, if any + template + void parse_bom(Ch *&text) + { + // UTF-8? + if (static_cast(text[0]) == 0xEF && + static_cast(text[1]) == 0xBB && + static_cast(text[2]) == 0xBF) + { + text += 3; // Skup utf-8 bom + } + } + + // Parse XML declaration ( + xml_node *parse_xml_declaration(Ch *&text) + { + // If parsing of declaration is disabled + if (!(Flags & parse_declaration_node)) + { + // Skip until end of declaration + while (text[0] != Ch('?') || text[1] != Ch('>')) + { + if (!text[0]) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + text += 2; // Skip '?>' + return 0; + } + + // Create declaration + xml_node *declaration = this->allocate_node(node_declaration); + + // Skip whitespace before attributes or ?> + skip(text); + + // Parse declaration attributes + parse_node_attributes(text, declaration); + + // Skip ?> + if (text[0] != Ch('?') || text[1] != Ch('>')) + RAPIDXML_PARSE_ERROR("expected ?>", text); + text += 2; + + return declaration; + } + + // Parse XML comment (' + return 0; // Do not produce comment node + } + + // Remember value start + Ch *value = text; + + // Skip until end of comment + while (text[0] != Ch('-') || text[1] != Ch('-') || text[2] != Ch('>')) + { + if (!text[0]) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + + // Create comment node + xml_node *comment = this->allocate_node(node_comment); + comment->value(value, text - value); + + // Place zero terminator after comment value + if (!(Flags & parse_no_string_terminators)) + *text = Ch('\0'); + + text += 3; // Skip '-->' + return comment; + } + + // Parse DOCTYPE + template + xml_node *parse_doctype(Ch *&text) + { + // Remember value start + Ch *value = text; + + // Skip to > + while (*text != Ch('>')) + { + // Determine character type + switch (*text) + { + + // If '[' encountered, scan for matching ending ']' using naive algorithm with depth + // This works for all W3C test files except for 2 most wicked + case Ch('['): + { + ++text; // Skip '[' + int depth = 1; + while (depth > 0) + { + switch (*text) + { + case Ch('['): ++depth; break; + case Ch(']'): --depth; break; + case 0: RAPIDXML_PARSE_ERROR("unexpected end of data", text); + } + ++text; + } + break; + } + + // Error on end of text + case Ch('\0'): + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + + // Other character, skip it + default: + ++text; + + } + } + + // If DOCTYPE nodes enabled + if (Flags & parse_doctype_node) + { + // Create a new doctype node + xml_node *doctype = this->allocate_node(node_doctype); + doctype->value(value, text - value); + + // Place zero terminator after value + if (!(Flags & parse_no_string_terminators)) + *text = Ch('\0'); + + text += 1; // skip '>' + return doctype; + } + else + { + text += 1; // skip '>' 
+ return 0; + } + + } + + // Parse PI + template + xml_node *parse_pi(Ch *&text) + { + // If creation of PI nodes is enabled + if (Flags & parse_pi_nodes) + { + // Create pi node + xml_node *pi = this->allocate_node(node_pi); + + // Extract PI target name + Ch *name = text; + skip(text); + if (text == name) + RAPIDXML_PARSE_ERROR("expected PI target", text); + pi->name(name, text - name); + + // Skip whitespace between pi target and pi + skip(text); + + // Remember start of pi + Ch *value = text; + + // Skip to '?>' + while (text[0] != Ch('?') || text[1] != Ch('>')) + { + if (*text == Ch('\0')) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + + // Set pi value (verbatim, no entity expansion or whitespace normalization) + pi->value(value, text - value); + + // Place zero terminator after name and value + if (!(Flags & parse_no_string_terminators)) + { + pi->name()[pi->name_size()] = Ch('\0'); + pi->value()[pi->value_size()] = Ch('\0'); + } + + text += 2; // Skip '?>' + return pi; + } + else + { + // Skip to '?>' + while (text[0] != Ch('?') || text[1] != Ch('>')) + { + if (*text == Ch('\0')) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + text += 2; // Skip '?>' + return 0; + } + } + + // Parse and append data + // Return character that ends data. + // This is necessary because this character might have been overwritten by a terminating 0 + template + Ch parse_and_append_data(xml_node *node, Ch *&text, Ch *contents_start) + { + // Backup to contents start if whitespace trimming is disabled + if (!(Flags & parse_trim_whitespace)) + text = contents_start; + + // Skip until end of data + Ch *value = text, *end; + if (Flags & parse_normalize_whitespace) + end = skip_and_expand_character_refs(text); + else + end = skip_and_expand_character_refs(text); + + // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after > + if (Flags & parse_trim_whitespace) + { + if (Flags & parse_normalize_whitespace) + { + // Whitespace is already condensed to single space characters by skipping function, so just trim 1 char off the end + if (*(end - 1) == Ch(' ')) + --end; + } + else + { + // Backup until non-whitespace character is found + while (whitespace_pred::test(*(end - 1))) + --end; + } + } + + // If characters are still left between end and value (this test is only necessary if normalization is enabled) + // Create new data node + if (!(Flags & parse_no_data_nodes)) + { + xml_node *data = this->allocate_node(node_data); + data->value(value, end - value); + node->append_node(data); + } + + // Add data to parent node if no data exists yet + if (!(Flags & parse_no_element_values)) + if (*node->value() == Ch('\0')) + node->value(value, end - value); + + // Place zero terminator after value + if (!(Flags & parse_no_string_terminators)) + { + Ch ch = *text; + *end = Ch('\0'); + return ch; // Return character that ends data; this is required because zero terminator overwritten it + } + + // Return character that ends data + return *text; + } + + // Parse CDATA + template + xml_node *parse_cdata(Ch *&text) + { + // If CDATA is disabled + if (Flags & parse_no_data_nodes) + { + // Skip until end of cdata + while (text[0] != Ch(']') || text[1] != Ch(']') || text[2] != Ch('>')) + { + if (!text[0]) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + text += 3; // Skip ]]> + return 0; // Do not produce CDATA node + } + + // Skip until end of cdata + Ch *value = text; + while (text[0] != Ch(']') || text[1] != Ch(']') 
|| text[2] != Ch('>')) + { + if (!text[0]) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + + // Create new cdata node + xml_node *cdata = this->allocate_node(node_cdata); + cdata->value(value, text - value); + + // Place zero terminator after value + if (!(Flags & parse_no_string_terminators)) + *text = Ch('\0'); + + text += 3; // Skip ]]> + return cdata; + } + + // Parse element node + template + xml_node *parse_element(Ch *&text) + { + // Create element node + xml_node *element = this->allocate_node(node_element); + + // Extract element name + Ch *name = text; + skip(text); + if (text == name) + RAPIDXML_PARSE_ERROR("expected element name", text); + element->name(name, text - name); + + // Skip whitespace between element name and attributes or > + skip(text); + + // Parse attributes, if any + parse_node_attributes(text, element); + + // Determine ending type + if (*text == Ch('>')) + { + ++text; + parse_node_contents(text, element); + } + else if (*text == Ch('/')) + { + ++text; + if (*text != Ch('>')) + RAPIDXML_PARSE_ERROR("expected >", text); + ++text; + } + else + RAPIDXML_PARSE_ERROR("expected >", text); + + // Place zero terminator after name + if (!(Flags & parse_no_string_terminators)) + element->name()[element->name_size()] = Ch('\0'); + + // Return parsed element + return element; + } + + // Determine node type, and parse it + template + xml_node *parse_node(Ch *&text) + { + // Parse proper node type + switch (text[0]) + { + + // <... + default: + // Parse and append element node + return parse_element(text); + + // (text); + } + else + { + // Parse PI + return parse_pi(text); + } + + // (text); + } + break; + + // (text); + } + break; + + // (text); + } + + } // switch + + // Attempt to skip other, unrecognized node types starting with ')) + { + if (*text == 0) + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + ++text; + } + ++text; // Skip '>' + return 0; // No node recognized + + } + } + + // Parse contents of the node - children, data etc. + template + void parse_node_contents(Ch *&text, xml_node *node) + { + // For all children and text + while (1) + { + // Skip whitespace between > and node contents + Ch *contents_start = text; // Store start of node contents before whitespace is skipped + skip(text); + Ch next_char = *text; + + // After data nodes, instead of continuing the loop, control jumps here. + // This is because zero termination inside parse_and_append_data() function + // would wreak havoc with the above code. + // Also, skipping whitespace after data nodes is unnecessary. + after_data_node: + + // Determine what comes next: node closing, child node, data node, or 0? 
+ switch (next_char) + { + + // Node closing or child node + case Ch('<'): + if (text[1] == Ch('/')) + { + // Node closing + text += 2; // Skip '(text); + if (!internal::compare(node->name(), node->name_size(), closing_name, text - closing_name, true)) + RAPIDXML_PARSE_ERROR("invalid closing tag name", text); + } + else + { + // No validation, just skip name + skip(text); + } + // Skip remaining whitespace after node name + skip(text); + if (*text != Ch('>')) + RAPIDXML_PARSE_ERROR("expected >", text); + ++text; // Skip '>' + return; // Node closed, finished parsing contents + } + else + { + // Child node + ++text; // Skip '<' + if (xml_node *child = parse_node(text)) + node->append_node(child); + } + break; + + // End of data - error + case Ch('\0'): + RAPIDXML_PARSE_ERROR("unexpected end of data", text); + + // Data node + default: + next_char = parse_and_append_data(node, text, contents_start); + goto after_data_node; // Bypass regular processing after data nodes + + } + } + } + + // Parse XML attributes of the node + template + void parse_node_attributes(Ch *&text, xml_node *node) + { + // For all attributes + while (attribute_name_pred::test(*text)) + { + // Extract attribute name + Ch *name = text; + ++text; // Skip first character of attribute name + skip(text); + if (text == name) + RAPIDXML_PARSE_ERROR("expected attribute name", name); + + // Create new attribute + xml_attribute *attribute = this->allocate_attribute(); + attribute->name(name, text - name); + node->append_attribute(attribute); + + // Skip whitespace after attribute name + skip(text); + + // Skip = + if (*text != Ch('=')) + RAPIDXML_PARSE_ERROR("expected =", text); + ++text; + + // Add terminating zero after name + if (!(Flags & parse_no_string_terminators)) + attribute->name()[attribute->name_size()] = 0; + + // Skip whitespace after = + skip(text); + + // Skip quote and remember if it was ' or " + Ch quote = *text; + if (quote != Ch('\'') && quote != Ch('"')) + RAPIDXML_PARSE_ERROR("expected ' or \"", text); + ++text; + + // Extract attribute value and expand char refs in it + Ch *value = text, *end; + const int AttFlags = Flags & ~parse_normalize_whitespace; // No whitespace normalization in attributes + if (quote == Ch('\'')) + end = skip_and_expand_character_refs, attribute_value_pure_pred, AttFlags>(text); + else + end = skip_and_expand_character_refs, attribute_value_pure_pred, AttFlags>(text); + + // Set attribute value + attribute->value(value, end - value); + + // Make sure that end quote is present + if (*text != quote) + RAPIDXML_PARSE_ERROR("expected ' or \"", text); + ++text; // Skip quote + + // Add terminating zero after value + if (!(Flags & parse_no_string_terminators)) + attribute->value()[attribute->value_size()] = 0; + + // Skip whitespace after attribute value + skip(text); + } + } + + }; + + //! 
\cond internal + namespace internal + { + + // Whitespace (space \n \r \t) + template + const unsigned char lookup_tables::lookup_whitespace[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, // 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1 + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 7 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // C + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // D + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // E + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F + }; + + // Node name (anything but space \n \r \t / > ? \0) + template + const unsigned char lookup_tables::lookup_node_name[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Text (i.e. PCDATA) (anything but < \0) + template + const unsigned char lookup_tables::lookup_text[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Text (i.e. 
PCDATA) that does not require processing when ws normalization is disabled + // (anything but < \0 &) + template + const unsigned char lookup_tables::lookup_text_pure_no_ws[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Text (i.e. PCDATA) that does not require processing when ws normalizationis is enabled + // (anything but < \0 & space \n \r \t) + template + const unsigned char lookup_tables::lookup_text_pure_with_ws[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute name (anything but space \n \r \t / < > = ? ! 
\0) + template + const unsigned char lookup_tables::lookup_attribute_name[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute data with single quote (anything but ' \0) + template + const unsigned char lookup_tables::lookup_attribute_data_1[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute data with single quote that does not require processing (anything but ' \0 &) + template + const unsigned char lookup_tables::lookup_attribute_data_1_pure[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute data with double quote (anything but " \0) + template + const unsigned char lookup_tables::lookup_attribute_data_2[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Attribute data with double quote that does not require processing (anything but " \0 &) + template + const unsigned char lookup_tables::lookup_attribute_data_2_pure[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F + }; + + // Digits (dec and hex, 255 denotes end of numeric character reference) + template + const unsigned char lookup_tables::lookup_digits[256] = + { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 1 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 2 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,255,255,255,255,255,255, // 3 + 255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255, // 4 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 5 + 255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255, // 6 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 7 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 8 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 9 + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // A + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // B + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // C + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // D + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // E + 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255 // F + }; + + // Upper case conversion + template + const unsigned char lookup_tables::lookup_upcase[256] = + { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A B C D E F + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0 + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // 1 + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // 2 + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // 3 + 64, 65, 
66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // 4 + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, // 5 + 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // 6 + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123,124,125,126,127, // 7 + 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, // 8 + 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159, // 9 + 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175, // A + 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191, // B + 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207, // C + 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223, // D + 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239, // E + 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255 // F + }; + } + //! \endcond + +} + +// Undefine internal macros +#undef RAPIDXML_PARSE_ERROR + +// On MSVC, restore warnings state +#ifdef _MSC_VER + #pragma warning(pop) +#endif + +#endif diff --git a/thirdpt/rapidxml-1.13/rapidxml_iterators.hpp b/thirdpt/rapidxml-1.13/rapidxml_iterators.hpp new file mode 100644 index 0000000..52ebc29 --- /dev/null +++ b/thirdpt/rapidxml-1.13/rapidxml_iterators.hpp @@ -0,0 +1,174 @@ +#ifndef RAPIDXML_ITERATORS_HPP_INCLUDED +#define RAPIDXML_ITERATORS_HPP_INCLUDED + +// Copyright (C) 2006, 2009 Marcin Kalicinski +// Version 1.13 +// Revision $DateTime: 2009/05/13 01:46:17 $ +//! \file rapidxml_iterators.hpp This file contains rapidxml iterators + +#include "rapidxml.hpp" + +namespace rapidxml +{ + + //! Iterator of child nodes of xml_node + template + class node_iterator + { + + public: + + typedef typename xml_node value_type; + typedef typename xml_node &reference; + typedef typename xml_node *pointer; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + + node_iterator() + : m_node(0) + { + } + + node_iterator(xml_node *node) + : m_node(node->first_node()) + { + } + + reference operator *() const + { + assert(m_node); + return *m_node; + } + + pointer operator->() const + { + assert(m_node); + return m_node; + } + + node_iterator& operator++() + { + assert(m_node); + m_node = m_node->next_sibling(); + return *this; + } + + node_iterator operator++(int) + { + node_iterator tmp = *this; + ++this; + return tmp; + } + + node_iterator& operator--() + { + assert(m_node && m_node->previous_sibling()); + m_node = m_node->previous_sibling(); + return *this; + } + + node_iterator operator--(int) + { + node_iterator tmp = *this; + ++this; + return tmp; + } + + bool operator ==(const node_iterator &rhs) + { + return m_node == rhs.m_node; + } + + bool operator !=(const node_iterator &rhs) + { + return m_node != rhs.m_node; + } + + private: + + xml_node *m_node; + + }; + + //! 
Iterator of child attributes of xml_node + template + class attribute_iterator + { + + public: + + typedef typename xml_attribute value_type; + typedef typename xml_attribute &reference; + typedef typename xml_attribute *pointer; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + + attribute_iterator() + : m_attribute(0) + { + } + + attribute_iterator(xml_node *node) + : m_attribute(node->first_attribute()) + { + } + + reference operator *() const + { + assert(m_attribute); + return *m_attribute; + } + + pointer operator->() const + { + assert(m_attribute); + return m_attribute; + } + + attribute_iterator& operator++() + { + assert(m_attribute); + m_attribute = m_attribute->next_attribute(); + return *this; + } + + attribute_iterator operator++(int) + { + attribute_iterator tmp = *this; + ++this; + return tmp; + } + + attribute_iterator& operator--() + { + assert(m_attribute && m_attribute->previous_attribute()); + m_attribute = m_attribute->previous_attribute(); + return *this; + } + + attribute_iterator operator--(int) + { + attribute_iterator tmp = *this; + ++this; + return tmp; + } + + bool operator ==(const attribute_iterator &rhs) + { + return m_attribute == rhs.m_attribute; + } + + bool operator !=(const attribute_iterator &rhs) + { + return m_attribute != rhs.m_attribute; + } + + private: + + xml_attribute *m_attribute; + + }; + +} + +#endif diff --git a/thirdpt/rapidxml-1.13/rapidxml_print.hpp b/thirdpt/rapidxml-1.13/rapidxml_print.hpp new file mode 100644 index 0000000..0ae2b14 --- /dev/null +++ b/thirdpt/rapidxml-1.13/rapidxml_print.hpp @@ -0,0 +1,421 @@ +#ifndef RAPIDXML_PRINT_HPP_INCLUDED +#define RAPIDXML_PRINT_HPP_INCLUDED + +// Copyright (C) 2006, 2009 Marcin Kalicinski +// Version 1.13 +// Revision $DateTime: 2009/05/13 01:46:17 $ +//! \file rapidxml_print.hpp This file contains rapidxml printer implementation + +#include "rapidxml.hpp" + +// Only include streams if not disabled +#ifndef RAPIDXML_NO_STREAMS + #include + #include +#endif + +namespace rapidxml +{ + + /////////////////////////////////////////////////////////////////////// + // Printing flags + + const int print_no_indenting = 0x1; //!< Printer flag instructing the printer to suppress indenting of XML. See print() function. + + /////////////////////////////////////////////////////////////////////// + // Internal + + //! 
\cond internal + namespace internal + { + + /////////////////////////////////////////////////////////////////////////// + // Internal character operations + + // Copy characters from given range to given output iterator + template + inline OutIt copy_chars(const Ch *begin, const Ch *end, OutIt out) + { + while (begin != end) + *out++ = *begin++; + return out; + } + + // Copy characters from given range to given output iterator and expand + // characters into references (< > ' " &) + template + inline OutIt copy_and_expand_chars(const Ch *begin, const Ch *end, Ch noexpand, OutIt out) + { + while (begin != end) + { + if (*begin == noexpand) + { + *out++ = *begin; // No expansion, copy character + } + else + { + switch (*begin) + { + case Ch('<'): + *out++ = Ch('&'); *out++ = Ch('l'); *out++ = Ch('t'); *out++ = Ch(';'); + break; + case Ch('>'): + *out++ = Ch('&'); *out++ = Ch('g'); *out++ = Ch('t'); *out++ = Ch(';'); + break; + case Ch('\''): + *out++ = Ch('&'); *out++ = Ch('a'); *out++ = Ch('p'); *out++ = Ch('o'); *out++ = Ch('s'); *out++ = Ch(';'); + break; + case Ch('"'): + *out++ = Ch('&'); *out++ = Ch('q'); *out++ = Ch('u'); *out++ = Ch('o'); *out++ = Ch('t'); *out++ = Ch(';'); + break; + case Ch('&'): + *out++ = Ch('&'); *out++ = Ch('a'); *out++ = Ch('m'); *out++ = Ch('p'); *out++ = Ch(';'); + break; + default: + *out++ = *begin; // No expansion, copy character + } + } + ++begin; // Step to next character + } + return out; + } + + // Fill given output iterator with repetitions of the same character + template + inline OutIt fill_chars(OutIt out, int n, Ch ch) + { + for (int i = 0; i < n; ++i) + *out++ = ch; + return out; + } + + // Find character + template + inline bool find_char(const Ch *begin, const Ch *end) + { + while (begin != end) + if (*begin++ == ch) + return true; + return false; + } + + /////////////////////////////////////////////////////////////////////////// + // Internal printing operations + + // Print node + template + inline OutIt print_node(OutIt out, const xml_node *node, int flags, int indent) + { + // Print proper node type + switch (node->type()) + { + + // Document + case node_document: + out = print_children(out, node, flags, indent); + break; + + // Element + case node_element: + out = print_element_node(out, node, flags, indent); + break; + + // Data + case node_data: + out = print_data_node(out, node, flags, indent); + break; + + // CDATA + case node_cdata: + out = print_cdata_node(out, node, flags, indent); + break; + + // Declaration + case node_declaration: + out = print_declaration_node(out, node, flags, indent); + break; + + // Comment + case node_comment: + out = print_comment_node(out, node, flags, indent); + break; + + // Doctype + case node_doctype: + out = print_doctype_node(out, node, flags, indent); + break; + + // Pi + case node_pi: + out = print_pi_node(out, node, flags, indent); + break; + + // Unknown + default: + assert(0); + break; + } + + // If indenting not disabled, add line break after node + if (!(flags & print_no_indenting)) + *out = Ch('\n'), ++out; + + // Return modified iterator + return out; + } + + // Print children of the node + template + inline OutIt print_children(OutIt out, const xml_node *node, int flags, int indent) + { + for (xml_node *child = node->first_node(); child; child = child->next_sibling()) + out = print_node(out, child, flags, indent); + return out; + } + + // Print attributes of the node + template + inline OutIt print_attributes(OutIt out, const xml_node *node, int flags) + { + for (xml_attribute *attribute = 
node->first_attribute(); attribute; attribute = attribute->next_attribute()) + { + if (attribute->name() && attribute->value()) + { + // Print attribute name + *out = Ch(' '), ++out; + out = copy_chars(attribute->name(), attribute->name() + attribute->name_size(), out); + *out = Ch('='), ++out; + // Print attribute value using appropriate quote type + if (find_char(attribute->value(), attribute->value() + attribute->value_size())) + { + *out = Ch('\''), ++out; + out = copy_and_expand_chars(attribute->value(), attribute->value() + attribute->value_size(), Ch('"'), out); + *out = Ch('\''), ++out; + } + else + { + *out = Ch('"'), ++out; + out = copy_and_expand_chars(attribute->value(), attribute->value() + attribute->value_size(), Ch('\''), out); + *out = Ch('"'), ++out; + } + } + } + return out; + } + + // Print data node + template + inline OutIt print_data_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_data); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + out = copy_and_expand_chars(node->value(), node->value() + node->value_size(), Ch(0), out); + return out; + } + + // Print data node + template + inline OutIt print_cdata_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_cdata); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'); ++out; + *out = Ch('!'); ++out; + *out = Ch('['); ++out; + *out = Ch('C'); ++out; + *out = Ch('D'); ++out; + *out = Ch('A'); ++out; + *out = Ch('T'); ++out; + *out = Ch('A'); ++out; + *out = Ch('['); ++out; + out = copy_chars(node->value(), node->value() + node->value_size(), out); + *out = Ch(']'); ++out; + *out = Ch(']'); ++out; + *out = Ch('>'); ++out; + return out; + } + + // Print element node + template + inline OutIt print_element_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_element); + + // Print element name and attributes, if any + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + out = copy_chars(node->name(), node->name() + node->name_size(), out); + out = print_attributes(out, node, flags); + + // If node is childless + if (node->value_size() == 0 && !node->first_node()) + { + // Print childless node tag ending + *out = Ch('/'), ++out; + *out = Ch('>'), ++out; + } + else + { + // Print normal node tag ending + *out = Ch('>'), ++out; + + // Test if node contains a single data node only (and no other nodes) + xml_node *child = node->first_node(); + if (!child) + { + // If node has no children, only print its value without indenting + out = copy_and_expand_chars(node->value(), node->value() + node->value_size(), Ch(0), out); + } + else if (child->next_sibling() == 0 && child->type() == node_data) + { + // If node has a sole data child, only print its value without indenting + out = copy_and_expand_chars(child->value(), child->value() + child->value_size(), Ch(0), out); + } + else + { + // Print all children with full indenting + if (!(flags & print_no_indenting)) + *out = Ch('\n'), ++out; + out = print_children(out, node, flags, indent + 1); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + } + + // Print node end + *out = Ch('<'), ++out; + *out = Ch('/'), ++out; + out = copy_chars(node->name(), node->name() + node->name_size(), out); + *out = Ch('>'), ++out; + } + return out; + } + + // Print declaration node + template + inline OutIt 
print_declaration_node(OutIt out, const xml_node *node, int flags, int indent) + { + // Print declaration start + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + *out = Ch('?'), ++out; + *out = Ch('x'), ++out; + *out = Ch('m'), ++out; + *out = Ch('l'), ++out; + + // Print attributes + out = print_attributes(out, node, flags); + + // Print declaration end + *out = Ch('?'), ++out; + *out = Ch('>'), ++out; + + return out; + } + + // Print comment node + template + inline OutIt print_comment_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_comment); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + *out = Ch('!'), ++out; + *out = Ch('-'), ++out; + *out = Ch('-'), ++out; + out = copy_chars(node->value(), node->value() + node->value_size(), out); + *out = Ch('-'), ++out; + *out = Ch('-'), ++out; + *out = Ch('>'), ++out; + return out; + } + + // Print doctype node + template + inline OutIt print_doctype_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_doctype); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + *out = Ch('!'), ++out; + *out = Ch('D'), ++out; + *out = Ch('O'), ++out; + *out = Ch('C'), ++out; + *out = Ch('T'), ++out; + *out = Ch('Y'), ++out; + *out = Ch('P'), ++out; + *out = Ch('E'), ++out; + *out = Ch(' '), ++out; + out = copy_chars(node->value(), node->value() + node->value_size(), out); + *out = Ch('>'), ++out; + return out; + } + + // Print pi node + template + inline OutIt print_pi_node(OutIt out, const xml_node *node, int flags, int indent) + { + assert(node->type() == node_pi); + if (!(flags & print_no_indenting)) + out = fill_chars(out, indent, Ch('\t')); + *out = Ch('<'), ++out; + *out = Ch('?'), ++out; + out = copy_chars(node->name(), node->name() + node->name_size(), out); + *out = Ch(' '), ++out; + out = copy_chars(node->value(), node->value() + node->value_size(), out); + *out = Ch('?'), ++out; + *out = Ch('>'), ++out; + return out; + } + + } + //! \endcond + + /////////////////////////////////////////////////////////////////////////// + // Printing + + //! Prints XML to given output iterator. + //! \param out Output iterator to print to. + //! \param node Node to be printed. Pass xml_document to print entire document. + //! \param flags Flags controlling how XML is printed. + //! \return Output iterator pointing to position immediately after last character of printed text. + template + inline OutIt print(OutIt out, const xml_node &node, int flags = 0) + { + return internal::print_node(out, &node, flags, 0); + } + +#ifndef RAPIDXML_NO_STREAMS + + //! Prints XML to given output stream. + //! \param out Output stream to print to. + //! \param node Node to be printed. Pass xml_document to print entire document. + //! \param flags Flags controlling how XML is printed. + //! \return Output stream. + template + inline std::basic_ostream &print(std::basic_ostream &out, const xml_node &node, int flags = 0) + { + print(std::ostream_iterator(out), node, flags); + return out; + } + + //! Prints formatted XML to given output stream. Uses default printing flags. Use print() function to customize printing process. + //! \param out Output stream to print to. + //! \param node Node to be printed. + //! \return Output stream. 
+ template + inline std::basic_ostream &operator <<(std::basic_ostream &out, const xml_node &node) + { + return print(out, node); + } + +#endif + +} + +#endif diff --git a/thirdpt/rapidxml-1.13/rapidxml_utils.hpp b/thirdpt/rapidxml-1.13/rapidxml_utils.hpp new file mode 100644 index 0000000..37c2953 --- /dev/null +++ b/thirdpt/rapidxml-1.13/rapidxml_utils.hpp @@ -0,0 +1,122 @@ +#ifndef RAPIDXML_UTILS_HPP_INCLUDED +#define RAPIDXML_UTILS_HPP_INCLUDED + +// Copyright (C) 2006, 2009 Marcin Kalicinski +// Version 1.13 +// Revision $DateTime: 2009/05/13 01:46:17 $ +//! \file rapidxml_utils.hpp This file contains high-level rapidxml utilities that can be useful +//! in certain simple scenarios. They should probably not be used if maximizing performance is the main objective. + +#include "rapidxml.hpp" +#include +#include +#include +#include + +namespace rapidxml +{ + + //! Represents data loaded from a file + template + class file + { + + public: + + //! Loads file into the memory. Data will be automatically destroyed by the destructor. + //! \param filename Filename to load. + file(const char *filename) + { + using namespace std; + + // Open stream + basic_ifstream stream(filename, ios::binary); + if (!stream) + throw runtime_error(string("cannot open file ") + filename); + stream.unsetf(ios::skipws); + + // Determine stream size + stream.seekg(0, ios::end); + size_t size = stream.tellg(); + stream.seekg(0); + + // Load data and add terminating 0 + m_data.resize(size + 1); + stream.read(&m_data.front(), static_cast(size)); + m_data[size] = 0; + } + + //! Loads file into the memory. Data will be automatically destroyed by the destructor + //! \param stream Stream to load from + file(std::basic_istream &stream) + { + using namespace std; + + // Load data and add terminating 0 + stream.unsetf(ios::skipws); + m_data.assign(istreambuf_iterator(stream), istreambuf_iterator()); + if (stream.fail() || stream.bad()) + throw runtime_error("error reading stream"); + m_data.push_back(0); + } + + //! Gets file data. + //! \return Pointer to data of file. + Ch *data() + { + return &m_data.front(); + } + + //! Gets file data. + //! \return Pointer to data of file. + const Ch *data() const + { + return &m_data.front(); + } + + //! Gets file data size. + //! \return Size of file data, in characters. + std::size_t size() const + { + return m_data.size(); + } + + private: + + std::vector m_data; // File data + + }; + + //! Counts children of node. Time complexity is O(n). + //! \return Number of children of node + template + inline std::size_t count_children(xml_node *node) + { + xml_node *child = node->first_node(); + std::size_t count = 0; + while (child) + { + ++count; + child = child->next_sibling(); + } + return count; + } + + //! Counts attributes of node. Time complexity is O(n). + //! \return Number of attributes of node + template + inline std::size_t count_attributes(xml_node *node) + { + xml_attribute *attr = node->first_attribute(); + std::size_t count = 0; + while (attr) + { + ++count; + attr = attr->next_attribute(); + } + return count; + } + +} + +#endif
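
As a quick orientation for these vendored headers: rapidxml 1.13 is bundled under thirdpt/, presumably so the raytracer examples can read examples/workloads/raytracer_scene.xml. The sketch below is a minimal, hypothetical illustration of the usual pattern with this library (rapidxml::file to load a zero-terminated buffer, xml_document::parse to build the DOM in place, first_node/next_sibling/first_attribute to walk it). The attribute name "name", the default file path, and the include paths are placeholder assumptions for illustration; the real schema and build flags live in the GSParLib sources.

// Hypothetical usage sketch of the vendored rapidxml 1.13 headers.
// The "name" attribute and the default path are placeholders, not taken
// from the actual raytracer_scene.xml schema.
#include <cstdio>
#include <stdexcept>
#include "thirdpt/rapidxml-1.13/rapidxml.hpp"
#include "thirdpt/rapidxml-1.13/rapidxml_utils.hpp"

int main(int argc, char *argv[]) {
    const char *path = (argc > 1) ? argv[1] : "examples/workloads/raytracer_scene.xml";
    try {
        rapidxml::file<> xml_file(path);      // loads the whole file and appends a zero terminator
        rapidxml::xml_document<> doc;
        doc.parse<0>(xml_file.data());        // parse() modifies the buffer in place; it must outlive doc

        rapidxml::xml_node<> *root = doc.first_node();   // first (root) element, whatever its name
        if (!root) {
            std::fprintf(stderr, "empty document\n");
            return 1;
        }
        std::printf("root element: %s\n", root->name());

        // Walk the root's children and print any "name" attribute they carry.
        for (rapidxml::xml_node<> *child = root->first_node(); child; child = child->next_sibling()) {
            rapidxml::xml_attribute<> *name = child->first_attribute("name");
            std::printf("  <%s> name=\"%s\"\n", child->name(), name ? name->value() : "(none)");
        }
    } catch (const rapidxml::parse_error &e) {
        std::fprintf(stderr, "XML parse error: %s\n", e.what());
        return 1;
    } catch (const std::runtime_error &e) {   // rapidxml::file throws std::runtime_error on I/O failure
        std::fprintf(stderr, "%s\n", e.what());
        return 1;
    }
    return 0;
}

The same xml_document/xml_node calls apply if the buffer is loaded by other means, as long as it is zero-terminated and persists while the document is in use; rapidxml_print.hpp can then serialize the tree back out via rapidxml::print.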