Skip to content

Commit

Permalink
tweak SquareTree parser
Browse files Browse the repository at this point in the history
  • Loading branch information
kjk committed Aug 11, 2024
1 parent ec241c2 commit 23d2559
Show file tree
Hide file tree
Showing 15 changed files with 153 additions and 131 deletions.
6 changes: 3 additions & 3 deletions src/ChmFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ TempStr ChmFile::SmartToUtf8Temp(const char* s, uint overrideCP) const {
}

WCHAR* ChmFile::SmartToWStr(const char* text) const {
return strconv::StrToWStr(text, codepage);
return strconv::StrCPToWStr(text, codepage);
}

static char* GetCharZ(const ByteSlice& d, size_t off) {
Expand Down Expand Up @@ -378,7 +378,7 @@ static bool VisitChmTocItem(EbookTocVisitor* visitor, HtmlElement* el, uint cp,
AutoFreeWStr attrVal(el->GetAttribute("value"));
if (attrName && attrVal && cp != CP_CHM_DEFAULT) {
AutoFreeStr bytes = strconv::WStrToCodePage(CP_CHM_DEFAULT, attrVal);
attrVal.Set(strconv::StrToWStr(bytes.Get(), cp));
attrVal.Set(strconv::StrCPToWStr(bytes.Get(), cp));
}
if (!attrName || !attrVal) {
/* ignore incomplete/unneeded <param> */;
Expand Down Expand Up @@ -430,7 +430,7 @@ static bool VisitChmIndexItem(EbookTocVisitor* visitor, HtmlElement* el, uint cp
if (attrName && attrVal && cp != CP_CHM_DEFAULT) {
// TODO: convert attrVal to CP_CHM_DEFAULT
// AutoFreeStr bytes = strconv::WStrToCodePage(CP_CHM_DEFAULT, attrVal);
// attrVal.Set(strconv::StrToWStr(bytes.Get(), cp));
// attrVal.Set(strconv::StrCPToWStr(bytes.Get(), cp));
}
if (!attrName || !attrVal) {
/* ignore incomplete/unneeded <param> */;
Expand Down
14 changes: 7 additions & 7 deletions src/EbookDoc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ static bool IsValidUtf8(const char* string) {
}

static TempStr DecodeTextToUtf8Temp(const char* s, bool isXML = false) {
if (str::StartsWith(s, UTF8_BOM)) {
return str::DupTemp(s + 3);
}
if (str::StartsWith(s, UTF16_BOM)) {
s += 2;
return ToUtf8Temp((WCHAR*)s);
}
if (str::StartsWith(s, UTF16BE_BOM)) {
// convert from utf16 big endian to utf16
s += 2;
Expand All @@ -93,13 +100,6 @@ static TempStr DecodeTextToUtf8Temp(const char* s, bool isXML = false) {
}
return ToUtf8Temp((WCHAR*)s);
}
if (str::StartsWith(s, UTF16_BOM)) {
s += 2;
return ToUtf8Temp((WCHAR*)s);
}
if (str::StartsWith(s, UTF8_BOM)) {
return str::DupTemp(s + 3);
}
uint codePage = isXML ? GetCodepageFromPI(s) : CP_ACP;
if (CP_ACP == codePage && IsValidUtf8(s)) {
return str::DupTemp(s);
Expand Down
2 changes: 1 addition & 1 deletion src/EngineMupdf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2048,7 +2048,7 @@ bool EngineMupdf::LoadFromStream(fz_stream* stm, const char* nameHint, PasswordU
// note: such passwords aren't portable when stored as Unicode text
if (!ok && GetACP() != 1252) {
AutoFreeStr pwd_ansi = str::Dup(pwd.Get());
AutoFreeWStr pwd_cp1252(strconv::StrToWStr(pwd_ansi.Get(), 1252));
AutoFreeWStr pwd_cp1252(strconv::StrCPToWStr(pwd_ansi.Get(), 1252));
pwdA = ToUtf8(pwd_cp1252);
ok = fz_authenticate_password(ctx, _doc, pwdA.Get());
}
Expand Down
5 changes: 3 additions & 2 deletions src/SumatraPDF.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,9 @@ void InitializePolicies(bool restrict) {
}

ByteSlice restrictData = file::ReadFile(restrictPath);
SquareTree sqt(restrictData);
SquareTreeNode* polsec = sqt.root ? sqt.root->GetChild("Policies") : nullptr;
SquareTreeNode* root = ParseSquareTree(restrictData);
AutoDelete delRoot(root);
SquareTreeNode* polsec = root ? root->GetChild("Policies") : nullptr;
// if the restriction file is broken, err on the side of full restriction
if (!polsec) {
return;
Expand Down
7 changes: 4 additions & 3 deletions src/UpdateCheck.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,12 @@ static UpdateInfo* ParseUpdateInfo(const char* d) {
return nullptr;
}

SquareTree tree(d);
if (!tree.root) {
SquareTreeNode* root = ParseSquareTree(d);
if (!root) {
return nullptr;
}
SquareTreeNode* node = tree.root->GetChild("SumatraPDF");
AutoDelete delRoot(root);
SquareTreeNode* node = root->GetChild("SumatraPDF");
if (!node) {
return nullptr;
}
Expand Down
2 changes: 1 addition & 1 deletion src/ifilter/TeXFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ HRESULT TeXFilter::OnInit() {
return res;
}

m_pData = strconv::StrToWStr(data, CP_ACP);
m_pData = strconv::StrCPToWStr(data, CP_ACP);
m_pBuffer = AllocArray<WCHAR>(data.size() + 1);
data.Free();

Expand Down
5 changes: 0 additions & 5 deletions src/utils/Scoped.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,6 @@ struct AutoFree {
char* StealData() {
return this->Release();
}

void TakeOwnershipOf(const char* s) {
free(data);
data = (char*)s;
}
};

// TODO: replace most of AutoFree with AutoFreeStr
Expand Down
11 changes: 7 additions & 4 deletions src/utils/SettingsUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -507,14 +507,17 @@ static void* DeserializeStructRec(const StructInfo* info, SquareTreeNode* node,
ByteSlice SerializeStruct(const StructInfo* info, const void* strct, const char* prevData) {
str::Str out;
out.Append(UTF8_BOM);
SquareTree prevSqt(prevData);
SerializeStructRec(out, info, strct, prevSqt.root);
SquareTreeNode* root = ParseSquareTree(prevData);
SerializeStructRec(out, info, strct, root);
delete root;
return out.StealAsByteSlice();
}

void* DeserializeStruct(const StructInfo* info, const char* data, void* strct) {
SquareTree sqt(data);
return DeserializeStructRec(info, sqt.root, (u8*)strct, !strct);
SquareTreeNode* root = ParseSquareTree(data);
auto res = DeserializeStructRec(info, root, (u8*)strct, !strct);
delete root;
return res;
}

static void FreeStructData(const StructInfo* info, u8* base) {
Expand Down
25 changes: 7 additions & 18 deletions src/utils/SquareTreeParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,23 +205,12 @@ static SquareTreeNode* ParseSquareTreeRec(char*& data, bool isTopLevel = false)
return node;
}

SquareTree::SquareTree(const char* data) : root(nullptr) {
// convert the file content to UTF-8
if (str::StartsWith(data, UTF8_BOM)) {
dataUtf8.SetCopy(data + 3);
} else if (str::StartsWith(data, UTF16_BOM)) {
auto tmp = ToUtf8((const WCHAR*)(data + 2));
dataUtf8.Set(tmp);
} else if (data) {
AutoFreeWStr tmp(strconv::AnsiToWStr(data));
auto tmp2 = ToUtf8(tmp.Get());
dataUtf8.Set(tmp2);
SquareTreeNode* ParseSquareTree(const char* s) {
char* data = strconv::UnknownToUtf8Temp(s);
if (!data) {
return nullptr;
}
if (!dataUtf8) {
return;
}

char* start = dataUtf8.Get();
root = ParseSquareTreeRec(start, true);
ReportIf(*start || !root);
char* tmp = data;
auto res = ParseSquareTreeRec(tmp, true);
return res;
}
15 changes: 2 additions & 13 deletions src/utils/SquareTreeParser.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
/* Copyright 2022 the SumatraPDF project authors (see AUTHORS file).
License: Simplified BSD (see COPYING.BSD) */

class SquareTreeNode {
public:
struct SquareTreeNode {
SquareTreeNode() = default;
~SquareTreeNode();

Expand Down Expand Up @@ -31,14 +30,4 @@ class SquareTreeNode {
SquareTreeNode* GetChild(const char* key, size_t* startIdx = nullptr) const;
};

class SquareTree {
AutoFree dataUtf8;

public:
explicit SquareTree(const char* data);
~SquareTree() {
delete root;
}

SquareTreeNode* root;
};
SquareTreeNode* ParseSquareTree(const char* s);
42 changes: 29 additions & 13 deletions src/utils/StrconvUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ char* WStrToUtf8(const WCHAR* s, size_t cch, Allocator* a) {
}

// caller needs to free() the result
WCHAR* StrToWStr(const char* src, uint codePage, int cbSrc) {
WCHAR* StrCPToWStr(const char* src, uint codePage, int cbSrc) {
ReportIf(!src);
if (!src) {
return nullptr;
Expand Down Expand Up @@ -96,7 +96,7 @@ TempStr ToMultiByteTemp(const char* src, uint codePageSrc, uint codePageDest) {
return str::DupTemp(src);
}

WCHAR* tmp = StrToWStr(src, codePageSrc);
WCHAR* tmp = StrCPToWStr(src, codePageSrc);
if (!tmp) {
return nullptr;
}
Expand All @@ -114,41 +114,57 @@ TempStr StrToUtf8Temp(const char* src, uint codePage) {
// tries to convert a string in unknown encoding to utf8, as best
// as it can
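// the Temp variant returns a temp-allocated string (every return path goes
// through str::DupTemp / ToUtf8Temp), so the caller must not free() it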
char* UnknownToUtf8(const char* s) {
char* UnknownToUtf8Temp(const char* s) {
size_t len = str::Len(s);

if (len < 3) {
return str::Dup(s, len);
return str::DupTemp(s, len);
}

if (str::StartsWith(s, UTF8_BOM)) {
return str::Dup(s + 3, len - 3);
return str::DupTemp(s + 3, len - 3);
}

// TODO: UTF16BE_BOM

if (str::StartsWith(s, UTF16_BOM)) {
s += 2;
int cch = (int)((len - 2) / 2);
return ToUtf8((const WCHAR*)s, cch);
// codeql complains about char* => WCHAR* cast
void* d = (void*)s;
return ToUtf8Temp((const WCHAR*)d, cch);
}

if (str::StartsWith(s, UTF16BE_BOM)) {
// convert from utf16 big endian to utf16
s += 2;
int n = str::Leni((WCHAR*)s);
char* tmp = (char*)s;
for (int i = 0; i < n; i++) {
int idx = i * 2;
std::swap(tmp[idx], tmp[idx + 1]);
}
// codeql complains about char* => WCHAR* cast
void* d = (void*)s;
return ToUtf8Temp((const WCHAR*)d);
}

// if s is valid utf8, leave it alone
const u8* tmp = (const u8*)s;
if (isLegalUTF8String(&tmp, tmp + len)) {
return str::Dup(s, len);
return str::DupTemp(s, len);
}

AutoFreeWStr uni = strconv::AnsiToWStr(s, len);
return ToUtf8(uni.Get());
WCHAR* ws = strconv::AnsiToWStr(s, len);
auto res = ToUtf8Temp(ws);
str::Free(ws);
return res;
}

WCHAR* AnsiToWStr(const char* src, size_t cbLen) {
return StrToWStr(src, CP_ACP, (int)cbLen);
return StrCPToWStr(src, CP_ACP, (int)cbLen);
}

char* AnsiToUtf8(const char* src, size_t cbLen) {
WCHAR* ws = StrToWStr(src, CP_ACP, (int)cbLen);
WCHAR* ws = StrCPToWStr(src, CP_ACP, (int)cbLen);
char* res = ToUtf8(ws);
str::Free(ws);
return res;
Expand Down
4 changes: 2 additions & 2 deletions src/utils/StrconvUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ char* WStrToUtf8(const WCHAR* s, size_t cch = (size_t)-1, Allocator* a = nullptr

char* WStrToCodePage(uint codePage, const WCHAR* s, size_t cch = (size_t)-1, Allocator* a = nullptr);
TempStr ToMultiByteTemp(const char* src, uint codePageSrc, uint codePageDest);
WCHAR* StrToWStr(const char* src, uint codePage, int cbSrc = -1);
WCHAR* StrCPToWStr(const char* src, uint codePage, int cbSrc = -1);
TempStr StrToUtf8Temp(const char* src, uint codePage);

char* UnknownToUtf8(const char*);
char* UnknownToUtf8Temp(const char*);

char* WStrToAnsi(const WCHAR*);
char* Utf8ToAnsi(const char*);
Expand Down
2 changes: 1 addition & 1 deletion src/utils/TrivialHtmlParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ static WCHAR IntToChar(int codepoint) {

// caller needs to free() the result
WCHAR* DecodeHtmlEntitites(const char* string, uint codepage) {
WCHAR* fixed = strconv::StrToWStr(string, codepage);
WCHAR* fixed = strconv::StrCPToWStr(string, codepage);
WCHAR* dst = fixed;
const WCHAR* src = fixed;

Expand Down
Loading

0 comments on commit 23d2559

Please sign in to comment.