Added the tokenize interface to Model
LeoMax-Xiong committed May 28, 2024
1 parent e74b331 commit a4eb625
Showing 11 changed files with 145 additions and 7 deletions.
3 changes: 2 additions & 1 deletion leomax_tokenizer/CMakeLists.txt
@@ -22,8 +22,9 @@ set(CPP_PACKAGE_DIR ${CMAKE_BINARY_DIR}/cpp/leomax_tokenizer)
set(TOKENIZER_CORE_NAME "core_tokenizers")

set(TOKENIZER_CORE_LIBS_PATH "${TOKENIZER_CORE_PATH}/lib${TOKENIZER_CORE_NAME}.so")

set(TOKENIZER_INSTALL_INCLUDE_DIR ${PROJECT_SOURCE_DIR})
include_directories(${TOKENIZER_INSTALL_INCLUDE_DIR})
message(STATUS "tokenizer install include dir ${TOKENIZER_INSTALL_INCLUDE_DIR}")

# Add predefined functions and the configuration required by third-party libraries
# config GIT_URL with github mirrors to speed up dependent repos clone
5 changes: 4 additions & 1 deletion leomax_tokenizer/leomax_tokenizer/CMakeLists.txt
@@ -1,7 +1,10 @@
add_subdirectory(models)

if (WITH_PYTHON)
    cc_library(core_tokenizers SHARED
               SRCS pybind/pybind.cc tokenizers/ernie_fast_tokenizer.cc
                    pybind/pytoken.cc pybind/pymodels.cc
               DEPS pybind)

    # Build the library without the "lib" prefix
    set_target_properties(core_tokenizers PROPERTIES PREFIX "")
18 changes: 18 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/base/base.h
@@ -0,0 +1,18 @@
//
// Created by xiongxinlei on 5/28/24.
//

#ifndef LEOMAX_TOKENIZER_BASE_H
#define LEOMAX_TOKENIZER_BASE_H

#include <cstdint>
#include <string>
#include <utility>

// Character span of a token in the original text: [start, end)
using Offset = std::pair<uint32_t, uint32_t>;

struct Token {
    uint32_t id_;        // token id
    std::string value_;  // token surface string
    Offset offset_;      // position in the original text
    Token(uint32_t id, const std::string& value, const Offset& offset)
        : id_(id), value_(value), offset_(offset) {
    }
};
#endif // LEOMAX_TOKENIZER_BASE_H
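For orientation, a small sketch of constructing and reading back the new Token type; everything outside Token and Offset, including the include path, is hypothetical and not part of this commit:

#include <iostream>

#include "base.h"  // Token, Offset (include path assumed)

int main() {
    // "hello" occupies characters [0, 5) of some original text.
    Token token(42, "hello", Offset(0, 5));
    std::cout << token.id_ << " '" << token.value_ << "' ["
              << token.offset_.first << ", " << token.offset_.second << ")\n";
    return 0;
}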
Empty file.
19 changes: 19 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/models/models.h
@@ -0,0 +1,19 @@
//
// Created by xiongxinlei on 5/27/24.
//

#ifndef MODELS_MODELS_H
#define MODELS_MODELS_H
#include <string>

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

namespace leomax_tokenizer {
namespace models {
class Model {
public:
    virtual ~Model() = default;
    // Tokenize the given text; concrete models provide the implementation.
    virtual void tokenize(const std::string& tokens) = 0;
};

}
}
#endif // MODELS_MODELS_H
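To illustrate the new interface, here is a minimal sketch of a concrete model. WhitespaceModel is hypothetical and not part of this commit; it assumes the Token and Offset types from base.h and repo-relative include paths:

#include <iostream>
#include <sstream>
#include <string>

#include "../base/base.h"  // Token, Offset (include path assumed)
#include "models.h"

namespace leomax_tokenizer {
namespace models {

// Hypothetical example: split on whitespace and print each Token
// together with its character offsets.
class WhitespaceModel : public Model {
public:
    void tokenize(const std::string& tokens) override {
        std::istringstream stream(tokens);
        std::string word;
        uint32_t id = 0;
        size_t search_from = 0;
        while (stream >> word) {
            size_t start = tokens.find(word, search_from);
            Offset offset(static_cast<uint32_t>(start),
                          static_cast<uint32_t>(start + word.size()));
            Token token(id++, word, offset);
            search_from = start + word.size();
            std::cout << token.value_ << " [" << token.offset_.first << ", "
                      << token.offset_.second << ")" << std::endl;
        }
    }
};

}  // namespace models
}  // namespace leomax_tokenizer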
13 changes: 9 additions & 4 deletions leomax_tokenizer/leomax_tokenizer/pybind/pybind.cc
@@ -1,16 +1,21 @@
#include <Python.h>
#include <pybind11/pybind11.h>

#include "pytokens.h"
#include "pymodels.h"
namespace leomax_tokenizer {
namespace pybind {

//namespace py = pybind11;

int add(int i, int j) {
    return i + j;
}

PYBIND11_MODULE(core_tokenizers, m) {
    m.doc() = "pybind11 leomax tokenizer core module";
    m.def("add", &add, "A function which adds two numbers");

    bind_models(&m);
}

} // namespace pybind
19 changes: 19 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/pybind/pymodels.cc
@@ -0,0 +1,19 @@
//
// Created by xiongxinlei on 5/27/24.
//

#include "pymodels.h"
#include "pytokens.h"
namespace py = pybind11;
namespace leomax_tokenizer {
namespace pybind {

void bind_models(pybind11::module* m) {
    auto submodule = m->def_submodule("models", "The models module");
    py::class_<models::Model, PyModel /* trampoline helper class */>(submodule, "Model")
        .def(py::init<>())
        .def("tokenize", &models::Model::tokenize);
}

} //pybind
} // leomax_tokenizer
29 changes: 29 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/pybind/pymodels.h
@@ -0,0 +1,29 @@
//
// Created by xiongxinlei on 5/27/24.
//

#ifndef PYBIND_PYMODELS_H
#define PYBIND_PYMODELS_H
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "../models/models.h"

namespace leomax_tokenizer {
namespace pybind {

// Trampoline class: lets a Python subclass override the pure virtual
// Model::tokenize (see run_test.py).
class PyModel : public models::Model {
public:
    virtual void tokenize(const std::string& tokens) override {
        PYBIND11_OVERRIDE_PURE(
            void,           // return type
            models::Model,  // parent class
            tokenize,       // name of the overridden method
            tokens);        // arguments
    }
};

void bind_models(pybind11::module* m);

}
}
#endif // PYBIND_PYMODELS_H
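For context on the trampoline: C++ code that only knows the abstract models::Model still reaches a Python-defined tokenize when the object was created from a Python subclass (as run_test.py does below). A minimal sketch; run_tokenize is a hypothetical helper and the include path is assumed, neither is part of this commit:

#include <string>

#include "../models/models.h"  // include path assumed

namespace leomax_tokenizer {

// Hypothetical helper: behaves identically whether `model` is a C++
// subclass of models::Model or a Python subclass wrapped by PyModel.
void run_tokenize(models::Model& model, const std::string& text) {
    // For a Python subclass, this virtual call lands in PyModel::tokenize,
    // and PYBIND11_OVERRIDE_PURE forwards it to the Python `tokenize`.
    model.tokenize(text);
}

}  // namespace leomax_tokenizer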
13 changes: 13 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/pybind/pytoken.cc
@@ -0,0 +1,13 @@
//
// Created by xiongxinlei on 5/26/24.
//
#include "pytokens.h"
#include "pymodels.h"
namespace py = pybind11;

namespace leomax_tokenizer {
namespace pybind {


} //pybind
} // leomax_tokenizer
22 changes: 22 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/pybind/pytokens.h
@@ -0,0 +1,22 @@
//
// Created by xiongxinlei on 5/26/24.
//

#ifndef PYBIND_TOKEN_H
#define PYBIND_TOKEN_H
#include <iostream>
#include <string>

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

namespace leomax_tokenizer {
namespace pybind {
class PyToken {
public:
    void tokenizer(const std::string& tokens) {
        std::cout << "PyToken tokenizer" << std::endl;
    }
};

}
}
#endif // PYBIND_TOKEN_H
11 changes: 10 additions & 1 deletion leomax_tokenizer/python/tests/run_test.py
@@ -1,6 +1,15 @@
from leomax_tokenizer.tokenizer_impl import SentencePieceBPELeoMaxTokenizer
import leomax_tokenizer


class TestModel(leomax_tokenizer.core_tokenizers.models.Model):
    def __init__(self):
        super().__init__()

    def tokenize(self, tokens):
        print("test model")
        return tokens


if __name__ == '__main__':
    # tokenizer = SentencePieceBPELeoMaxTokenizer(vocab='vocab.json',
    #                                             merges='merges.txt',)
    print(leomax_tokenizer.core_tokenizers.add(1, 2))
    model = TestModel()
    print(model.tokenize("今天天气很不错"))
