forked from PaddlePaddle/PaddleNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
e74b331
commit a4eb625
Showing
11 changed files
with
145 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
// | ||
// Created by xiongxinlei on 5/28/24. | ||
// | ||
|
||
#ifndef LEOMAX_TOKENIZER_BASE_H | ||
#define LEOMAX_TOKENIZER_BASE_H | ||
|
||
#include <cstdint>
#include <string>
#include <utility>

// Pair of unsigned 32-bit values attached to a token.
// NOTE(review): presumably (start, end) positions in the source text - confirm.
using Offset = std::pair<uint32_t, uint32_t>;

// Plain record describing one token: its numeric id, its string value and
// the Offset pair associated with it.
struct Token {
    uint32_t id_;  // token id
    std::string value_;
    Offset offset_;

    // Copies the three arguments into the corresponding members.
    // The member-initializer list must follow a ':' placed directly after the
    // parameter list; the previous form put the ':' after the list and did
    // not compile.
    Token(uint32_t id, const std::string& value, const Offset& offset)
        : id_(id), value_(value), offset_(offset) {}
};
#endif // LEOMAX_TOKENIZER_BASE_H |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// | ||
// Created by xiongxinlei on 5/27/24. | ||
// | ||
|
||
#ifndef MODELS_MODELS_H | ||
#define MODELS_MODELS_H | ||
#include <pybind11/pybind11.h> | ||
#include <pybind11/stl.h> | ||
|
||
namespace leomax_tokenizer { | ||
namespace models { | ||
// Abstract base class for models: declares the single pure virtual
// `tokenize` operation that concrete models must implement.
class Model {
public:
    // Virtual destructor so that an object of a derived class is destroyed
    // correctly when it is deleted through a pointer to Model.
    virtual ~Model() = default;

    // Pure virtual; receives the input string `tokens` and returns nothing.
    virtual void tokenize(const std::string& tokens) = 0;
};
|
||
} | ||
} | ||
#endif // MODELS_MODELS_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// | ||
// Created by xiongxinlei on 5/27/24. | ||
// | ||
|
||
#include "pymodels.h" | ||
#include "pytokens.h" | ||
namespace py = pybind11; | ||
namespace leomax_tokenizer { | ||
namespace pybind { | ||
|
||
// Registers the "models" submodule on `m` and exposes models::Model in it
// under the Python name "Model".
void bind_models(pybind11::module* m) {
    auto models_module = m->def_submodule("models", "The models module");

    // PyModel is the helper class passed alongside models::Model.
    py::class_<models::Model, PyModel> model_binding(models_module, "Model");
    model_binding.def(py::init<>());
    model_binding.def("tokenize", &models::Model::tokenize);
}
|
||
} //pybind | ||
} // leomax_tokenizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
// | ||
// Created by xiongxinlei on 5/27/24. | ||
// | ||
|
||
#ifndef PYBIND_PYMODELS_H | ||
#define PYBIND_PYMODELS_H | ||
#include <pybind11/pybind11.h> | ||
#include <pybind11/stl.h> | ||
#include "../models/models.h" | ||
|
||
namespace leomax_tokenizer { | ||
namespace pybind { | ||
|
||
// Helper subclass of models::Model whose `tokenize` is routed through
// pybind11's override machinery.
class PyModel : public models::Model {
public:
    // Dispatches via PYBIND11_OVERRIDE_PURE with return type `void`, base
    // class `models::Model`, function name `tokenize` and argument `tokens`.
    void tokenize(const std::string& tokens) override {
        PYBIND11_OVERRIDE_PURE(void, models::Model, tokenize, tokens);
    }
};
|
||
void bind_models(pybind11::module* m); | ||
|
||
} | ||
} | ||
#endif // PYBIND_PYMODELS_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
// | ||
// Created by xiongxinlei on 5/26/24. | ||
// | ||
#include "pytokens.h" | ||
#include "pymodels.h" | ||
namespace py = pybind11; | ||
|
||
namespace leomax_tokenizer { | ||
namespace pybind { | ||
|
||
|
||
} //pybind | ||
} // leomax_tokenizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
// | ||
// Created by xiongxinlei on 5/26/24. | ||
// | ||
|
||
#ifndef PYBIND_TOKEN_H | ||
#define PYBIND_TOKEN_H | ||
#include <pybind11/pybind11.h> | ||
#include <pybind11/stl.h> | ||
|
||
namespace leomax_tokenizer { | ||
namespace pybind { | ||
// Small class with a single member function that writes a fixed message to
// standard output.
// NOTE(review): uses std::cout, but this header only includes pybind11
// headers - confirm <iostream> is reachable for every includer.
class PyToken {
public:
    // Writes "PyModel tokenizer" followed by std::endl to std::cout.
    // The `tokens` argument is not read.
    void tokenizer(const std::string& tokens) {
        std::cout << "PyModel tokenizer";
        std::cout << std::endl;
    }
};
|
||
} | ||
} | ||
#endif // PYBIND_TOKEN_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,15 @@ | ||
from leomax_tokenizer.tokenizer_impl import SentencePieceBPELeoMaxTokenizer | ||
import leomax_tokenizer | ||
class TestModel(leomax_tokenizer.core_tokenizers.models.Model):
    """Python subclass of the bound ``Model`` class that overrides ``tokenize``."""

    def __init__(self):
        # Run the base-class initializer before anything else.
        super().__init__()

    def tokenize(self, tokens):
        """Print a fixed marker message and return ``tokens`` unchanged."""
        print("测试模型")
        return tokens
if __name__ == '__main__':
    # tokenizer = SentencePieceBPELeoMaxTokenizer(vocab='vocab.json',
    # merges='merges.txt',)

    # Call the bound add(1, 2) twice, printing each result.
    for _ in range(2):
        print(leomax_tokenizer.core_tokenizers.add(1, 2))

    # Instantiate the Python-side subclass and print what tokenize returns.
    model = TestModel()
    result = model.tokenize("今天天气很不错")
    print(result)