forked from PaddlePaddle/PaddleNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a4eb625
commit a6ea495
Showing
10 changed files
with
153 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
// | ||
// Created by xiongxinlei on 5/28/24. | ||
// | ||
#include "unordered_map" | ||
#include <utility> | ||
#ifndef LEOMAX_TOKENIZER_BASE_H | ||
#define LEOMAX_TOKENIZER_BASE_H | ||
namespace leomax_tokenizer {
namespace core {

/// Pair of unsigned 32-bit positions attached to a token.
/// NOTE(review): presumably (start, end) within the source text — confirm
/// against the code that produces offsets.
using Offset = std::pair<uint32_t, uint32_t>;

/// String-keyed table of unsigned 32-bit ids.
using Vocab = std::unordered_map<std::string, uint32_t>;

/// Sequence of string pairs (presumably the BPE merge rules, per the name).
using Merges = std::vector<std::pair<std::string, std::string>>;

/// Plain value type bundling a token's id, its string value and its offset pair.
struct Token {
    uint32_t id_;        // token id
    std::string value_;  // string value of the token
    Offset offset_;      // offset pair associated with the token

    /// Builds a token from its three components.
    ///
    /// `value` is taken by value and moved into place so that callers passing
    /// a temporary (or an explicitly moved string) do not pay for a copy;
    /// callers passing an lvalue still get exactly one copy, as before.
    Token(uint32_t id, std::string value, const Offset& offset)
        : id_(id), value_(std::move(value)), offset_(offset) {}
};

}  // namespace core
}  // namespace leomax_tokenizer
|
||
#endif // LEOMAX_TOKENIZER_BASE_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
// | ||
// Created by xiongxinlei on 5/29/24. | ||
// | ||
|
||
#ifndef LEOMAX_TOKENIZER_CACHE_H | ||
#define LEOMAX_TOKENIZER_CACHE_H | ||
|
||
#endif // LEOMAX_TOKENIZER_CACHE_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
// | ||
// Created by xiongxinlei on 5/28/24. | ||
// | ||
#include "bpe.h" | ||
#include <iostream> | ||
namespace leomax_tokenizer { | ||
namespace models { | ||
|
||
BPE::BPE() : | ||
fuse_unk_(false){ | ||
} | ||
|
||
// Constructs a BPE model, copying `dropout` into dropout_ and recording the
// `fuse_unk` flag.
// NOTE(review): `vocab` and `merges` are accepted but neither stored nor used
// by this constructor.
BPE::BPE(const core::Vocab& vocab,
         const core::Merges& merges,
         const std::vector<float>& dropout,
         bool fuse_unk)
    : dropout_(dropout), fuse_unk_(fuse_unk) {}
|
||
// Tokenizes `text` into `tokens`, consulting cache_ before doing the merge work.
//
// text:   the input string, also used as the cache key.
// tokens: output vector handed to word_to_tokens().
//
// On a cache hit the cached word is converted directly; on a miss the word is
// built with merge_word(), converted, and then stored in the cache.
void BPE::tokenize_with_cache(const std::string& text, std::vector<core::Token>& tokens) {
    core::BPEWord bpe_word;
    // The lookup takes the key and an out-pointer as two separate arguments.
    // The previous code dropped the comma and misspelled the variable, which
    // parsed as a bitwise-and against an undeclared identifier.
    if (cache_.get_value(text, &bpe_word)) {
        word_to_tokens(bpe_word, tokens);
    } else {
        // Split the text into words according to the merge rules.
        merge_word(text, &bpe_word);
        // Convert each word into a token.
        word_to_tokens(bpe_word, tokens);
        // Add the word to the cache.
        cache_.set_value(text, bpe_word);
    }
}
|
||
std::vector<core::Token> BPE::tokenize(const std::string& text) { | ||
std::vector<core::Token> tokens; | ||
std::cout << "BPE tokenizer " << std::endl; | ||
std::cout << "text: " << text << std::endl; | ||
|
||
if (text.empty()) { | ||
return tokens; | ||
} | ||
|
||
if (dropout_.empty()) { | ||
return tokens; | ||
} | ||
|
||
|
||
return tokens; | ||
} | ||
} // namespace models | ||
} // namespace leomax_tokenizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
// | ||
// Created by xiongxinlei on 5/28/24. | ||
// | ||
|
||
#ifndef LEOMAX_TOKENIZER_BPE_H | ||
#define LEOMAX_TOKENIZER_BPE_H | ||
#include "models.h" | ||
#include "vector" | ||
#include "../core/base.h" | ||
namespace leomax_tokenizer { | ||
namespace models { | ||
|
||
class BPE : public Model { | ||
public: | ||
BPE(); | ||
BPE(const core::Vocab& vocab, | ||
const core::Merges& merges, | ||
const std::vector<float>& dropout = {}, | ||
bool fuse_unk = false); | ||
|
||
virtual std::vector<core::Token> tokenize(const std::string& text) override; | ||
private: | ||
void tokenize_with_cache(const std::string& text, std::vector<core::Token>& tokens); | ||
private: | ||
std::vector<float> dropout_; | ||
bool fuse_unk_; | ||
|
||
|
||
}; | ||
|
||
} // namespace models | ||
} // namespace leomax_tokenizer | ||
|
||
#endif // LEOMAX_TOKENIZER_BPE_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters