let UNICODE = require("./tria.js");
let UTF8 = require("./dio.js");
// Demo 1: walk the conversion chain one step at a time
// (string -> \uXXXX text -> \xXX text -> \uXXXX text -> string).
const str = "he,βγ,你好,にほ,네이,龠龥";
console.log("str: "+str);
const unicode = UNICODE.encode(str);
console.log("unicode: "+unicode);
const utf8 = UTF8.encode(unicode);
console.log("utf8: "+utf8);
const unicode2 = UTF8.decode(utf8);
console.log("unicode: "+unicode2);
const str2 = UNICODE.decode(unicode2);
console.log("str: "+str2);
console.log(`
=====================我是风骚的分隔线^_^==========================
`);
// Demo 2: the same round trip through the one-call helpers.
// Each log line prints this demo's own variable ($str / $utf8); the previous
// version printed the variables of demo 1 by mistake.
const $str = "he,βγ,你好,にほ,네이,龠龥";
console.log("str: "+$str);
const $utf8 = UTF8.str2utf8($str);
console.log("utf8: "+$utf8);
const $$str = UTF8.utf82str($utf8);
console.log("str: "+$$str);
/**
* Uses little-endian representation.
* NOTE(review): UTF-8 has a fixed byte order and no endianness; encode() below
* emits the lead byte first - confirm what the original note meant.
*
    | Unicode code point range | UTF-8 encoding
  n | (hexadecimal)            | (binary)
 ---+--------------------------+------------------------------------------------------
  1 | 0000 0000 - 0000 007F    | 0xxxxxxx
  2 | 0000 0080 - 0000 07FF    | 110xxxxx 10xxxxxx
  3 | 0000 0800 - 0000 FFFF    | 1110xxxx 10xxxxxx 10xxxxxx
  4 | 0001 0000 - 0010 FFFF    | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  5 | 0020 0000 - 03FF FFFF    | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  6 | 0400 0000 - 7FFF FFFF    | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 Table 1. UTF-8 encoding rules
*
*
**/
/**
 * Format one byte value as a `\x`-prefixed lowercase hex escape,
 * e.g. 0xe4 -> "\xe4".
 *
 * Values below 0x10 are zero-padded to two digits ("\x0a", not "\xa") so the
 * output always has the `\xXX` shape; parseInt-based readers accept both forms.
 *
 * @param {number} bin - byte value to format
 * @returns {string} backslash, "x", then at least two hex digits
 */
function binary2hexstr(bin){
    const hex = bin.toString(16);
    return '\\x' + (hex.length < 2 ? '0' + hex : hex);
}
/**
 * Format a numeric code point as a `\u`-prefixed lowercase hex escape.
 * Zero padding is delegated to UNICODE.addZero.
 *
 * @param {number} bin - code point value to format
 * @returns {string} backslash, "u", then the padded hex digits
 */
function binary2unicodestr(bin){
    const paddedHex = UNICODE.addZero(bin.toString(16));
    return "\\u" + paddedHex;
}
/**
 * Convert `\uXXXX` escape text into UTF-8 `\xXX` escape text.
 *
 * The input is split on `\u`; anything before the first `\u` is dropped and
 * each following piece is parsed as a hexadecimal code point (parseInt stops
 * at the first non-hex character). Every code point is encoded on its own;
 * surrogate halves are not combined into one code point.
 *
 * Byte layout follows the UTF-8 bit patterns for 1 to 6 bytes:
 *   <= 0x7F        1 byte     <= 0x1FFFFF    4 bytes
 *   <= 0x7FF       2 bytes    <= 0x3FFFFFF   5 bytes
 *   <= 0xFFFF      3 bytes    <= 0x7FFFFFFF  6 bytes
 *
 * Pieces that do not parse, are negative, or exceed 0x7FFFFFFF contribute
 * nothing to the result.
 *
 * @param {string} unicode - text containing `\uXXXX` escapes
 * @returns {string} concatenated `\xXX` escapes, or "" when there is no `\u`
 */
function encode(unicode){
    if(!unicode.includes("\\u")){
        return "";
    }
    const ret = [];
    const codePoints = unicode.split(/\\u/g).slice(1).map(v=>parseInt(v,16));
    for(const u of codePoints){
        // `!(u >= 0)` also rejects NaN from an unparsable piece.
        if(!(u >= 0) || u > 0x7FFFFFFF){
            continue;
        }
        if(u <= 0x7F){ // 1 byte
            ret.push(binary2hexstr(u));
            continue;
        }
        // tail = number of continuation bytes, lead = marker bits of byte 1
        let tail;
        let lead;
        if(u <= 0x7FF){
            tail = 1; lead = 0xC0;
        }else if(u <= 0xFFFF){
            tail = 2; lead = 0xE0;
        }else if(u <= 0x1FFFFF){
            tail = 3; lead = 0xF0;
        }else if(u <= 0x3FFFFFF){
            tail = 4; lead = 0xF8;
        }else{
            tail = 5; lead = 0xFC;
        }
        // The range check above guarantees the bits left after the shift fit
        // under the lead marker, so no extra mask is needed.
        ret.push(binary2hexstr((u >> (6*tail)) | lead));
        // Continuation bytes carry 6 payload bits each, most significant first.
        for(let shift = 6*(tail-1); shift >= 0; shift -= 6){
            ret.push(binary2hexstr(((u >> shift) & 0x3F) | 0x80));
        }
    }
    return ret.join("");
}
/**
 * Convert UTF-8 `\xXX` escape text back into `\uXXXX` escape text.
 *
 * The input is split on `\x`; anything before the first `\x` is dropped and
 * each following piece is parsed as a hexadecimal byte. The number of bytes in
 * a sequence is taken from the lead byte via UTF8.getByteNumber.
 *
 * Decoding stops (keeping what was decoded so far) at a byte that does not
 * parse, at a continuation byte (10xxxxxx) found where a lead byte is
 * expected, or at a lead byte announcing more than 6 bytes. A 0x00 byte is a
 * normal one-byte character and does not stop decoding. Missing continuation
 * bytes at the end of the input contribute zero bits.
 *
 * @param {string} utf8 - text containing `\xXX` escapes
 * @returns {string} concatenated `\u` escapes
 */
function decode(utf8){
    const ret = [];
    const bytes = utf8.split('\\x').slice(1).map(v=>parseInt(v,16));
    // Walk with an index rather than repeatedly splicing the array front.
    let pos = 0;
    while(pos < bytes.length){
        const firstByte = bytes[pos];
        if(Number.isNaN(firstByte) || (firstByte>>6 === 2)){
            break;
        }
        const count = UTF8.getByteNumber(firstByte);
        if(count > 6){
            // 0xFE / 0xFF are not valid lead bytes; shifting by 36+ bits would wrap.
            break;
        }
        let bin;
        if(count === 1){
            bin = firstByte;
        }else{
            // Lead byte keeps its low (7 - count) bits; 0x7F>>count is that mask.
            bin = (firstByte & (0x7F>>count)) << (6*(count-1));
            for(let k = 1; k < count; k++){
                bin |= (bytes[pos+k] & 0x3F) << (6*(count-1-k));
            }
        }
        ret.push(binary2unicodestr(bin));
        pos += count;
    }
    return ret.join("");
}
/**
 * Static helpers converting between plain strings and `\uXXXX` escape text.
 */
class UNICODE{
    /**
     * Left-pad a hex string with "0" until it is at least four characters.
     * Longer strings are returned unchanged.
     *
     * @param {string} hex
     * @returns {string}
     */
    static addZero(hex){
        let padded = hex;
        while(padded.length < 4){
            padded = "0" + padded;
        }
        return padded;
    }
    /**
     * Convert a string to `\uXXXX` escape text, one escape per UTF-16 code
     * unit (a character outside the BMP yields two escapes).
     *
     * @param {string} str
     * @returns {string} "" for an empty or missing input
     */
    static encode(str){
        if(!str){
            return "";
        }
        const ret = [];
        // Index loop instead of for...in, which walks enumerable keys.
        for(let i = 0; i < str.length; i++){
            ret.push("\\u"+UNICODE.addZero(str.charCodeAt(i).toString(16)));
        }
        return ret.join("");
    }
    /**
     * Convert `\uXXXX` escape text back to a string.
     *
     * Only `\u` followed by 2 to 4 hex digits is replaced; every other
     * character, including `%` and lone backslashes, is left untouched.
     * The earlier backslash-to-"%" + unescape() approach rewrote stray
     * backslashes and re-decoded "%XX" text produced by the escapes.
     *
     * @param {string} unicode
     * @returns {string} "" for an empty or missing input
     */
    static decode(unicode){
        if(!unicode){
            return "";
        }
        return unicode.replace(/\\u([\da-fA-F]{2,4})/g,
            (match, hex)=>String.fromCharCode(parseInt(hex,16)));
    }
}
/**
 * Static entry points for the UTF-8 escape-text conversions in this module.
 */
class UTF8{
    /**
     * Count the consecutive 1 bits of `byte`, scanning from bit 7 down to
     * bit 0 and stopping at the first 0 bit. A count of zero is reported as 1.
     *
     * @param {number} byte
     * @returns {number} 1..8
     */
    static getByteNumber(byte){
        let ones = 0;
        for(let pos = 7; pos >= 0; pos--){
            if(((byte >> pos) & 1) === 0){
                break;
            }
            ones += 1;
        }
        return ones === 0 ? 1 : ones;
    }
    /** Forward to the module-level encode(): `\u` text -> `\x` text. */
    static encode(unicode){
        const utf8Text = encode(unicode);
        return utf8Text;
    }
    /** Forward to the module-level decode(): `\x` text -> `\u` text. */
    static decode(utf8){
        const unicodeText = decode(utf8);
        return unicodeText;
    }
    /** String -> `\u` text (UNICODE.encode) -> `\x` text (UTF8.encode). */
    static str2utf8(str){
        const unicodeText = UNICODE.encode(str);
        return UTF8.encode(unicodeText);
    }
    /** `\x` text -> `\u` text (UTF8.decode) -> string (UNICODE.decode). */
    static utf82str(utf8){
        const unicodeText = UTF8.decode(utf8);
        return UNICODE.decode(unicodeText);
    }
}
module.exports = UTF8;
/**
 * Static helpers converting between plain strings and `\uXXXX` escape text.
 */
class UNICODE{
    /**
     * Left-pad a hex string with "0" until it is at least four characters.
     * Longer strings are returned unchanged.
     *
     * @param {string} hex
     * @returns {string}
     */
    static addZero(hex){
        let padded = hex;
        while(padded.length < 4){
            padded = "0" + padded;
        }
        return padded;
    }
    /**
     * Convert a string to `\uXXXX` escape text, one escape per UTF-16 code
     * unit (a character outside the BMP yields two escapes).
     *
     * @param {string} str
     * @returns {string} "" for an empty or missing input
     */
    static encode(str){
        if(!str){
            return "";
        }
        const ret = [];
        // Index loop instead of for...in, which walks enumerable keys.
        for(let i = 0; i < str.length; i++){
            ret.push("\\u"+UNICODE.addZero(str.charCodeAt(i).toString(16)));
        }
        return ret.join("");
    }
    /**
     * Convert `\uXXXX` escape text back to a string.
     *
     * Only `\u` followed by 2 to 4 hex digits is replaced; every other
     * character, including `%` and lone backslashes, is left untouched.
     * The earlier backslash-to-"%" + unescape() approach rewrote stray
     * backslashes and re-decoded "%XX" text produced by the escapes.
     *
     * @param {string} unicode
     * @returns {string} "" for an empty or missing input
     */
    static decode(unicode){
        if(!unicode){
            return "";
        }
        return unicode.replace(/\\u([\da-fA-F]{2,4})/g,
            (match, hex)=>String.fromCharCode(parseInt(hex,16)));
    }
}
module.exports = UNICODE;