KCPP Fetches AMD ROCm Memory without a stick, CC_TURING Gets the Boot, koboldcpp_hipblas.dll Talks To The Hand, and hipBLAS Compiler Finds Its Independence! (#517)

* AMD ROCm memory fetching and max mem setting
* Update .gitignore with koboldcpp_hipblas.dll
* Update CMakeLists.txt: remove CC_TURING for AMD
* separate hipBLAS compiler, update MMV_Y, move CXX/CC print
  separate hipBLAS compiler, update MMV_Y value, move the section that prints CXX and CC compiler name
LostRuins · Nov 5, 2023 · e2e5fe5 · e2e5fe5
1 parent a62468e
commit e2e5fe5
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 26 deletions.
diff --git a/.gitignore b/.gitignore
@@ -111,6 +111,7 @@ tests/test-tokenizer-1-bpe
 rocblas.dll
 hipblas.dll
 koboldcpp_hipblas.so
+koboldcpp_hipblas.dll
 
 # Jetbrains idea folder
 .idea/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -153,7 +153,6 @@ if (LLAMA_HIPBLAS)
         target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        target_compile_definitions(ggml-rocm PUBLIC CC_TURING=1000000000)
         set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
 
@@ -165,7 +164,6 @@ if (LLAMA_HIPBLAS)
         target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-v2-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        target_compile_definitions(ggml-v2-rocm PUBLIC CC_TURING=1000000000)
         set_source_files_properties(otherarch/ggml_v2-cuda.cu PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-v2-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
 
@@ -177,7 +175,6 @@ if (LLAMA_HIPBLAS)
         target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-v2-legacy-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        target_compile_definitions(ggml-v2-legacy-rocm PUBLIC CC_TURING=1000000000)
         set_source_files_properties(otherarch/ggml_v2-cuda-legacy.cu PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-v2-legacy-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
 
@@ -437,4 +434,4 @@ target_link_libraries(llama PRIVATE
     ${LLAMA_EXTRA_LIBS}
     )
 add_subdirectory(examples)
-endif()
+endif()
diff --git a/Makefile b/Makefile
@@ -196,16 +196,13 @@ endif # LLAMA_CUBLAS
 
 ifdef LLAMA_HIPBLAS
 	ROCM_PATH	?= /opt/rocm
-	CC         := $(ROCM_PATH)/llvm/bin/clang
-	CXX        := $(ROCM_PATH)/llvm/bin/clang++
+	HCC         := $(ROCM_PATH)/llvm/bin/clang
+	HCXX        := $(ROCM_PATH)/llvm/bin/clang++
 	GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	LLAMA_CUDA_DMMV_X ?= 32
-	LLAMA_CUDA_MMV_Y ?= 2
+	LLAMA_CUDA_MMV_Y ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
 	HIPFLAGS   += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
-ifdef LLAMA_CUDA_FORCE_DMMV
-	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
-endif # LLAMA_CUDA_FORCE_DMMV
 	HIPLDFLAGS    += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
 	HIP_OBJS       += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
@@ -221,11 +218,11 @@ ggml_v2-cuda-legacy.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
                         -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
                         -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+	$(HCXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
-	$(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+	$(HCXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
-	$(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+	$(HCXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # LLAMA_HIPBLAS
 
 
@@ -259,8 +256,6 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif
 
-CCV := $(shell $(CC) --version | head -n 1)
-CXXV := $(shell $(CXX) --version | head -n 1)
 
 DEFAULT_BUILD =
 FAILSAFE_BUILD =
@@ -281,7 +276,7 @@ ifeq ($(OS),Windows_NT)
 		CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS)
 	endif
 	ifdef LLAMA_HIPBLAS
-		HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.dll $(HIPLDFLAGS) $(LDFLAGS)
+		HIPBLAS_BUILD = $(HCXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.dll $(HIPLDFLAGS) $(LDFLAGS)
 	endif
 else
 	DEFAULT_BUILD = $(CXX) $(CXXFLAGS)  $^ -shared -o $@.so $(LDFLAGS)
@@ -300,7 +295,7 @@ else
 		CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS)
 	endif
 	ifdef LLAMA_HIPBLAS
-		HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.so $(HIPLDFLAGS) $(LDFLAGS)
+		HIPBLAS_BUILD = $(HCXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.so $(HIPLDFLAGS) $(LDFLAGS)
 	endif
 
 	ifndef LLAMA_OPENBLAS
@@ -314,7 +309,8 @@ else
 	endif
 endif
 
-
+CCV := $(shell $(CC) --version | head -n 1)
+CXXV := $(shell $(CXX) --version | head -n 1)
 
 #
 # Print build information

diff --git a/koboldcpp.py b/koboldcpp.py
@@ -1086,6 +1086,7 @@ def auto_gpu_heuristics():
         from subprocess import run, CalledProcessError
         FetchedCUdevices = []
         FetchedCUdeviceMem = []
+        AMDgpu = None
         try: # Get OpenCL GPU names on windows using a special binary. overwrite at known index if found.
             basepath = os.path.abspath(os.path.dirname(__file__))
             output = run([((os.path.join(basepath, "winclinfo.exe")) if os.name == 'nt' else "clinfo"),"--json"], capture_output=True, text=True, check=True, encoding='utf-8').stdout
@@ -1119,29 +1120,41 @@ def auto_gpu_heuristics():
             try: # Get AMD ROCm GPU names
                 output = run(['rocminfo'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
                 device_name = None
-                for line in output.splitlines():
+                for line in output.splitlines(): # read through the output line by line
                     line = line.strip()
-                    if line.startswith("Marketing Name:"): device_name = line.split(":", 1)[1].strip()
-                    elif line.startswith("Device Type:") and "GPU" in line and device_name is not None: FetchedCUdevices.append(device_name)
+                    if line.startswith("Marketing Name:"): device_name = line.split(":", 1)[1].strip() # if we find a named device, temporarily save the name
+                    elif line.startswith("Device Type:") and "GPU" in line and device_name is not None: # if the following Device Type is a GPU (not a CPU) then add it to devices list
+                        FetchedCUdevices.append(device_name)
+                        AMDgpu = True
                     elif line.startswith("Device Type:") and "GPU" not in line: device_name = None
+                if FetchedCUdevices:
+                    getamdvram = run(['rocm-smi', '--showmeminfo', 'vram', '--csv'], capture_output=True, text=True, check=True, encoding='utf-8').stdout # fetch VRAM of devices
+                    FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
             except Exception as e:
                 pass
 
         for idx in range(0,4):
             if(len(FetchedCUdevices)>idx):
                 CUDevicesNames[idx] = FetchedCUdevices[idx]
-                MaxMemory[0] = max(int(FetchedCUdeviceMem[idx])*1024*1024,MaxMemory[0])
-                pass
+                if AMDgpu:
+                    MaxMemory[0] = max(int(FetchedCUdeviceMem[idx]),MaxMemory[0])
+                else:
+                    MaxMemory[0] = max(int(FetchedCUdeviceMem[idx])*1024*1024,MaxMemory[0])
+                    pass
 
         #autopick cublas if suitable
         global exitcounter
-        if exitcounter < 100 and MaxMemory[0]>3500000000 and CUDevicesNames[0]!="" and "Use CuBLAS" in runopts and runopts_var.get()=="Use OpenBLAS":
-            runopts_var.set("Use CuBLAS")
-            pass
+        if exitcounter < 100 and MaxMemory[0]>3500000000 and CUDevicesNames[0]!="" and "Use CuBLAS" or "Use hipBLAS (ROCM)" in runopts and runopts_var.get()=="Use OpenBLAS":
+            if "Use CuBLAS" in runopts:
+                runopts_var.set("Use CuBLAS")
+                pass
+            elif "Use hipBLAS (ROCM)" in runopts:
+                runopts_var.set("Use hipBLAS (ROCM)")
 
         changed_gpu_choice_var()
         return
 
+
     def autoset_gpu_layers(filepath): #shitty algo to determine how many layers to use
         try:
             global gui_layers_untouched