-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathcdless
executable file
·40 lines (38 loc) · 2.13 KB
/
cdless
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/local/bin/pike
/*
Simple way to quickly eyeball a text file in an unknown encoding.
Calls on chardet(1) to make an educated guess as to what encoding seems most
likely, then decodes the file using that, and pipes it into less(1). If the
selected encoding is incorrect, it can be quickly overridden with a second
argument. The original file is not mutated in any way.
*/
// Entry point: decode a file from a guessed (or user-specified) encoding and
// page it through less(1) as UTF-8.
//
// argv[1] is the file to view. argv[2], if given, is either an explicit
// encoding name to use instead of detection, or the pseudo-encoding "win"
// (case-insensitive), which still runs detection but maps some ISO-8859-N
// results onto Windows code pages.
//
// Returns -1 so that Pike keeps running its backend (event loop) after main()
// returns; the process then exits from the less(1) status callback below.
int main(int argc,array(string) argv)
{
    if (argc<2) exit(0,"USAGE: %s filename [encoding]\n",argv[0]);
    string data=Stdio.read_file(argv[1]);
    //read_file() yields 0 rather than throwing when the file can't be read;
    //catch that here instead of failing obscurely further down.
    if (!data) exit(1,"Unable to read %s\n",argv[1]);
    string enc;
    if (argc>2 && lower_case(argv[2])!="win") enc=argv[2];
    else
    {
        string detected=Process.run(({"chardet"}),(["stdin":data]))->stdout;
        sscanf(detected,"<stdin>: %s with confidence",enc);
        //If chardet's output isn't in the expected form (eg it reports no result
        //for empty or binary input), enc is left unset. Bail out with something
        //readable rather than passing 0 on to sscanf/Charset.decoder.
        if (!enc || enc=="") exit(1,"Unable to detect encoding of %s (chardet said: %O)\n",argv[1],detected);
        //Hack: The pseudo-encoding "win" means "detect the encoding, but translate ISO-8859 into Windows code pages".
        //Note that not all of these codepages are supersets of the corresponding ISO-8859 character sets, so this
        //is not something you'd do by default. But if chardet's choice leaves some unexpected characters around, you
        //can try switching out to a similar codepage - particularly if it's likely that the file came from Windows.
        //The table is indexed by ISO-8859 part number; a zero entry means "no substitute, keep the detected name".
        //The lower bound matters because a negative index would count from the end of the array.
        if (argc>2 && sscanf(enc,"ISO-8859-%d",int iso) && iso>=0 && iso<10)
            enc = ({0,"1252","1250",0,0,0,0,"1253","1255","1254"})[iso] || enc;
    }
    //Decoding "UTF-8 with BOM" is as simple as decoding UTF-8 and stripping the BOM.
    //(Don't skip the decode and encode, incidentally. I want to get an error thrown if it's invalid UTF-8.)
    if (enc=="UTF-8-SIG" && has_prefix(data,"\xef\xbb\xbf")) {enc="UTF-8"; data=data[3..];}
    //chardet picks MacCyrillic for a number of Arabic documents.
    //Since we don't have a decoder for MacCyrillic anyway, let's
    //second-guess the detection and pick the Windows Arabic cp.
    if (enc=="MacCyrillic") enc="1256";
    string txt=Charset.decoder(enc)->feed(data)->drain();
    //One end of the pipe becomes less's stdin; we write to the other end.
    Stdio.File send=Stdio.File(),recv=send->pipe();
    send->write("Selected encoding: %s\n\n",enc);
    //Quit as soon as the pager reports status 2 (ie it has exited), even if not everything was sent.
    object proc=Process.create_process(({"less"}),(["stdin":recv,"callback":lambda(object p) {if (p->status()==2) exit(0);}]));
    //Queue the whole re-encoded text as the pipe's output buffer, and close our
    //end from the write callback so that less sees EOF.
    send->set_buffer_mode(0,Stdio.Buffer(string_to_utf8(txt)));
    send->set_nonblocking(0,lambda() {send->close();});
    return -1;
}