Skip to content

Commit

Permalink
添加bpe部分代码
Browse files Browse the repository at this point in the history
  • Loading branch information
LeoMax-Xiong committed May 28, 2024
1 parent a4eb625 commit a6ea495
Show file tree
Hide file tree
Showing 10 changed files with 153 additions and 26 deletions.
2 changes: 1 addition & 1 deletion leomax_tokenizer/leomax_tokenizer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ add_subdirectory(models)
if (WITH_PYTHON)
cc_library(core_tokenizers SHARED
SRCS pybind/pybind.cc tokenizers/ernie_fast_tokenizer.cc
pybind/pytoken.cc pybind/pymodels.cc
pybind/pytoken.cc pybind/pymodels.cc models/bpe.cc
DEPS pybind)

# 设置编译出来的库没有 "lib"前缀
Expand Down
18 changes: 0 additions & 18 deletions leomax_tokenizer/leomax_tokenizer/base/base.h

This file was deleted.

28 changes: 28 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/core/base.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//
// Created by xiongxinlei on 5/28/24.
//
#ifndef LEOMAX_TOKENIZER_BASE_H
#define LEOMAX_TOKENIZER_BASE_H

// Core value types shared by all tokenizer models.
// Fixed includes: <unordered_map> was included with quotes, and <string>,
// <vector>, <cstdint> were missing (Vocab/Merges/Token depend on them).
#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace leomax_tokenizer {
namespace core {

// (start, end) offsets of a token within the original text.
using Offset = std::pair<uint32_t, uint32_t>;
// Token string -> token id.
using Vocab = std::unordered_map<std::string, uint32_t>;
// Ordered list of BPE merge rules (each rule merges a pair of symbols).
using Merges = std::vector<std::pair<std::string, std::string>>;

// One tokenization result: id, surface string, and source offsets.
struct Token {
    uint32_t id_;        // token id
    std::string value_;  // token surface string
    Offset offset_;      // position within the input text
    Token(uint32_t id, const std::string& value, const Offset& offset) :
        id_(id), value_(value), offset_(offset) {
    }
};

}  // namespace core
}  // namespace leomax_tokenizer

#endif  // LEOMAX_TOKENIZER_BASE_H
8 changes: 8 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/core/cache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
//
// Created by xiongxinlei on 5/29/24.
//
// Placeholder for the tokenization cache (text -> merged BPE word).
// NOTE(review): models/bpe.cc already uses a `cache_` member with
// get_value/set_value — the cache type still needs to be declared here.

#ifndef LEOMAX_TOKENIZER_CACHE_H
#define LEOMAX_TOKENIZER_CACHE_H

#endif  // LEOMAX_TOKENIZER_CACHE_H
53 changes: 53 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/models/bpe.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//
// Created by xiongxinlei on 5/28/24.
//
#include "bpe.h"
#include <iostream>
namespace leomax_tokenizer {
namespace models {

// Default-construct an empty BPE model; unknown-token fusing is off.
BPE::BPE() : fuse_unk_(false) {}

// Construct a BPE model from a vocabulary and merge rules.
// NOTE(review): vocab and merges are accepted but currently ignored —
// TODO store them once the corresponding members are declared in bpe.h.
BPE::BPE(const core::Vocab& vocab,
         const core::Merges& merges,
         const std::vector<float>& dropout,
         bool fuse_unk) :
    dropout_(dropout),
    fuse_unk_(fuse_unk) {

}

// Tokenize |text| into |tokens|, consulting the cache first and
// populating it on a miss.
// Fixed: the cache lookup was written as `get_value(text &bpe_worod)` —
// a missing comma plus a misspelled variable name, which cannot compile.
void BPE::tokenize_with_cache(const std::string& text, std::vector<core::Token>& tokens) {
    core::BPEWord bpe_word;
    if (cache_.get_value(text, &bpe_word)) {
        // Cache hit: reuse the previously merged word.
        word_to_tokens(bpe_word, tokens);
    } else {
        // Split the text into sub-words according to the merge rules.
        merge_word(text, &bpe_word);
        // Convert each sub-word into a token.
        word_to_tokens(bpe_word, tokens);
        // Store the result for future lookups.
        cache_.set_value(text, bpe_word);
    }
}

// Tokenize |text| into a sequence of tokens.
// Returns an empty vector for empty input. When no BPE dropout is
// configured, merges are deterministic, so results are served through
// the memoizing cache path.
std::vector<core::Token> BPE::tokenize(const std::string& text) {
    std::vector<core::Token> tokens;
    std::cout << "BPE tokenizer " << std::endl;
    std::cout << "text: " << text << std::endl;

    if (text.empty()) {
        return tokens;
    }

    if (dropout_.empty()) {
        // Fixed: this branch previously returned an empty vector, so the
        // tokenizer never produced any tokens in the default (no-dropout)
        // configuration; it should take the cached deterministic path.
        tokenize_with_cache(text, tokens);
        return tokens;
    }

    // TODO(review): implement the dropout (probabilistic merge) path.
    return tokens;
}
} // namespace models
} // namespace leomax_tokenizer
34 changes: 34 additions & 0 deletions leomax_tokenizer/leomax_tokenizer/models/bpe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//
// Created by xiongxinlei on 5/28/24.
//

#ifndef LEOMAX_TOKENIZER_BPE_H
#define LEOMAX_TOKENIZER_BPE_H
// Fixed includes: <vector> was included with quotes; <string> (used in
// every public signature) was missing.
#include <string>
#include <vector>
#include "models.h"
#include "../core/base.h"

namespace leomax_tokenizer {
namespace models {

// Byte-Pair-Encoding tokenization model.
class BPE : public Model {
public:
    // Construct an empty model (no vocab/merges, unk fusing off).
    BPE();
    // Construct from a vocabulary and ordered merge rules.
    // |dropout| enables BPE-dropout when non-empty; |fuse_unk| requests
    // fusing consecutive unknown tokens into one.
    BPE(const core::Vocab& vocab,
        const core::Merges& merges,
        const std::vector<float>& dropout = {},
        bool fuse_unk = false);

    // Split |text| into tokens according to the merge rules.
    virtual std::vector<core::Token> tokenize(const std::string& text) override;

private:
    // Tokenize with memoization: repeated words are served from a cache.
    void tokenize_with_cache(const std::string& text,
                             std::vector<core::Token>& tokens);

private:
    std::vector<float> dropout_;  // per-merge dropout probabilities
    bool fuse_unk_;               // fuse consecutive unknown tokens
};

}  // namespace models
}  // namespace leomax_tokenizer

#endif  // LEOMAX_TOKENIZER_BPE_H
4 changes: 3 additions & 1 deletion leomax_tokenizer/leomax_tokenizer/models/models.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
#define MODELS_MODELS_H
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>
#include "../core/base.h"

namespace leomax_tokenizer {
namespace models {
// Abstract base for all tokenization models (BPE, WordPiece, ...).
class Model {
public:
    // Polymorphic base: virtual destructor so deleting a derived model
    // through a Model* is well-defined.
    virtual ~Model() = default;
    // Split |tokens| (the raw text) into a sequence of core::Token.
    virtual std::vector<core::Token> tokenize(const std::string& tokens) = 0;
};

}
Expand Down
12 changes: 8 additions & 4 deletions leomax_tokenizer/leomax_tokenizer/pybind/pymodels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,14 @@ namespace leomax_tokenizer {
namespace pybind {

// Register the `models` submodule: the abstract Model base class and the
// concrete BPE model, each exposed with its Python trampoline helper so
// Python subclasses can override tokenize().
void bind_models(pybind11::module* m) {
    auto submodule = m->def_submodule("models", "The models module");
    py::class_<models::Model, PyModel /*trampoline*/>(submodule, "Model")
        .def(py::init<>())
        .def("tokenize", &models::Model::tokenize);

    py::class_<models::BPE, PyBPE /*trampoline*/>(submodule, "BPE")
        .def(py::init<>())
        .def("tokenize", &models::BPE::tokenize);  // fixed stray ';;'
}

} //pybind
Expand Down
17 changes: 15 additions & 2 deletions leomax_tokenizer/leomax_tokenizer/pybind/pymodels.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,34 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "../models/models.h"
#include "../models/bpe.h"

namespace leomax_tokenizer {
namespace pybind {

class PyModel : public models::Model {
public:
virtual void tokenize(const std::string& tokens) override {
virtual std::vector<core::Token> tokenize(const std::string& tokens) override {
PYBIND11_OVERRIDE_PURE(
void,
std::vector<core::Token>,
models::Model,
tokenize,
tokens);
}
};


class PyBPE : public models::BPE {
virtual std::vector<core::Token> tokenize (
const std::string& tokens) override {
PYBIND11_OVERRIDE_PURE(
std::vector<core::Token>,
models::BPE,
tokenize,
tokens);
}
};

void bind_models(pybind11::module* m);

}
Expand Down
3 changes: 3 additions & 0 deletions leomax_tokenizer/python/tests/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ def tokenize(self, tokens):
# Sanity check: the add() binding exported by the C++ extension.
print(leomax_tokenizer.core_tokenizers.add(1, 2))
# Python subclass of the abstract Model base (TestModel defined above).
model = TestModel()
print(model.tokenize("今天天气很不错"))

# Smoke test of the C++ BPE model exposed through pybind11; default-constructed,
# so it presumably has no vocab/merges yet — output may be empty (WIP).
model = leomax_tokenizer.core_tokenizers.models.BPE()
print(model.tokenize("今天天气很不错"))

0 comments on commit a6ea495

Please sign in to comment.