猜猜看:哪一种转换方法最快
蛋疼写了三种UTF-16到UTF-8的转换方法。其中一个不出所料果然很慢,但另外的两个测试结果让余跌了一下眼镜。
直接在内存上操作速度当然快,因此convertMethod.h中的convert2utf8毫无疑问是最快的。
剩下的
convert2utf8_pushback
convert2utf8_copy
convert2utf8_allocate
都是返回STL中的string对象,哪个最快,哪个最慢,大家来猜一下吧
convertMethod.h
#include <cstring>
#include <string>
#include <vector>

// UTF-16 -> UTF-8 conversion routines.
//
// Every function is `inline` so this header may be included from more than
// one translation unit without violating the one-definition rule.
//
// Shared conventions:
//   * `src` points to raw UTF-16 bytes WITHOUT a byte-order mark and
//     `src_length` is the size in BYTES (it must be non-zero and even).
//   * `is_little_endian` selects the byte order of each 16-bit code unit.
//   * A valid surrogate pair is combined into one code point and emitted as a
//     4-byte UTF-8 sequence; an unpaired surrogate is dropped from the output.

namespace convert_method_detail
{
    // Reads one 16-bit code unit from two bytes in the requested byte order.
    inline unsigned int readCodeUnit(const unsigned char *bytes, bool is_little_endian)
    {
        return is_little_endian
            ? static_cast<unsigned int>(bytes[0] | (bytes[1] << 8))
            : static_cast<unsigned int>(bytes[1] | (bytes[0] << 8));
    }

    // Decodes the code point starting at byte offset `pos` and advances `pos`
    // past the code unit(s) consumed. Returns false (leaving `code_point`
    // untouched) when the unit at `pos` is an unpaired surrogate; in that case
    // only that single unit is consumed.
    // Precondition: `src_length` is even and `pos < src_length`.
    inline bool nextCodePoint(const unsigned char *src, unsigned int src_length,
                              unsigned int &pos, bool is_little_endian,
                              unsigned long &code_point)
    {
        const unsigned int unit = readCodeUnit(src + pos, is_little_endian);
        pos += 2;

        if (unit < 0xD800 || unit > 0xDFFF)
        {
            code_point = unit;
            return true;
        }

        // A high surrogate must be followed immediately by a low surrogate.
        if (unit <= 0xDBFF && pos < src_length)
        {
            const unsigned int low = readCodeUnit(src + pos, is_little_endian);
            if (low >= 0xDC00 && low <= 0xDFFF)
            {
                pos += 2;
                code_point = 0x10000UL
                           + (static_cast<unsigned long>(unit - 0xD800) << 10)
                           + (low - 0xDC00);
                return true;
            }
        }
        return false;
    }

    // Number of UTF-8 bytes needed for `code_point` (1 to 4).
    inline unsigned int utf8Length(unsigned long code_point)
    {
        if (code_point <= 0x007F) return 1;
        if (code_point <= 0x07FF) return 2;
        if (code_point <= 0xFFFF) return 3;
        return 4;
    }

    // Writes the UTF-8 encoding of `code_point` to `out` and returns the
    // number of bytes written. `out` must have room for utf8Length(code_point)
    // bytes.
    inline unsigned int encodeUtf8(unsigned long code_point, char *out)
    {
        if (code_point <= 0x007F)            // U+0000 - U+007F
        {
            out[0] = static_cast<char>(code_point);
            return 1;
        }
        if (code_point <= 0x07FF)            // U+0080 - U+07FF
        {
            out[0] = static_cast<char>(0xC0 | (code_point >> 6));
            out[1] = static_cast<char>(0x80 | (code_point & 0x3F));
            return 2;
        }
        if (code_point <= 0xFFFF)            // U+0800 - U+FFFF (surrogates never reach here)
        {
            out[0] = static_cast<char>(0xE0 | (code_point >> 12));
            out[1] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
            out[2] = static_cast<char>(0x80 | (code_point & 0x3F));
            return 3;
        }
        // U+10000 - U+10FFFF
        out[0] = static_cast<char>(0xF0 | (code_point >> 18));
        out[1] = static_cast<char>(0x80 | ((code_point >> 12) & 0x3F));
        out[2] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        out[3] = static_cast<char>(0x80 | (code_point & 0x3F));
        return 4;
    }
}

// Returns true when (src, src_length) can be treated as UTF-16 input:
// non-null, non-empty, and an even number of bytes.
inline bool match(const char *src, unsigned int src_length)
{
    return src != NULL && src_length != 0 && (src_length & 1) == 0;
}

// Returns the number of UTF-8 bytes the converters below produce for the
// given input, or 0 when the input is rejected by match().
inline unsigned int calcUtf8StringLength(const char *src, unsigned int src_length, bool is_little_endian)
{
    if (!match(src, src_length))
        return 0;

    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(src);
    unsigned int total = 0;
    unsigned long code_point = 0;
    for (unsigned int pos = 0; pos < src_length;)
    {
        if (convert_method_detail::nextCodePoint(bytes, src_length, pos, is_little_endian, code_point))
            total += convert_method_detail::utf8Length(code_point);
    }
    return total;
}

// Converts by appending every output byte with std::string::push_back.
// Returns an empty string when the input is rejected by match().
inline std::string convert2utf8_pushback(const char *src, unsigned int src_length, bool is_little_endian)
{
    std::string utf8str;
    if (!match(src, src_length))
        return utf8str;

    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(src);
    char encoded[4];
    unsigned long code_point = 0;
    for (unsigned int pos = 0; pos < src_length;)
    {
        if (!convert_method_detail::nextCodePoint(bytes, src_length, pos, is_little_endian, code_point))
            continue;

        const unsigned int count = convert_method_detail::encodeUtf8(code_point, encoded);
        for (unsigned int j = 0; j < count; ++j)
            utf8str.push_back(encoded[j]);
    }
    return utf8str;
}

// Converts into a temporary, exactly sized buffer and then copy-constructs
// the result string from it.
// Returns an empty string when the input is rejected by match().
inline std::string convert2utf8_copy(const char *src, unsigned int src_length, bool is_little_endian)
{
    if (!match(src, src_length))
        return std::string();

    const unsigned int need_length = calcUtf8StringLength(src, src_length, is_little_endian);

    // The vector owns the scratch memory, so nothing leaks if constructing
    // the result string throws.
    std::vector<char> buffer(need_length + 1, '\0');
    char *out = &buffer[0];

    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(src);
    unsigned long code_point = 0;
    for (unsigned int pos = 0; pos < src_length;)
    {
        if (convert_method_detail::nextCodePoint(bytes, src_length, pos, is_little_endian, code_point))
            out += convert_method_detail::encodeUtf8(code_point, out);
    }

    // Pass the length explicitly: U+0000 encodes as a zero byte, so treating
    // the buffer as a C string would cut the result short.
    return std::string(&buffer[0], need_length);
}

// Converts by pre-sizing the result string and assigning bytes by index.
// Returns an empty string when the input is rejected by match().
inline std::string convert2utf8_allocate(const char *src, unsigned int src_length, bool is_little_endian)
{
    if (!match(src, src_length))
        return std::string();

    const unsigned int need_length = calcUtf8StringLength(src, src_length, is_little_endian);
    std::string utf8str(need_length, '\0');
    unsigned int offset = 0;

    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(src);
    char encoded[4];
    unsigned long code_point = 0;
    for (unsigned int pos = 0; pos < src_length;)
    {
        if (!convert_method_detail::nextCodePoint(bytes, src_length, pos, is_little_endian, code_point))
            continue;

        const unsigned int count = convert_method_detail::encodeUtf8(code_point, encoded);
        for (unsigned int j = 0; j < count; ++j)
            utf8str[offset++] = encoded[j];
    }
    return utf8str;
}

// Converts directly into the caller-supplied buffer `dest` of `dest_length`
// bytes. The buffer is zero-filled first, so unused trailing bytes are 0.
//
// Returns false when a pointer is null, a length is zero, `src_length` is
// odd, the two buffers overlap, or `dest` is too small for the output.
// With `check_dest_length` set, the size is verified up front and `dest` is
// left untouched on failure; otherwise the shortage is detected during the
// conversion and `dest` holds a partial result when false is returned.
// Writes never go past `dest + dest_length`.
inline bool convert2utf8(const char *src, unsigned int src_length, char *dest, unsigned int dest_length, bool is_little_endian, bool check_dest_length=false)
{
    if (NULL == src || NULL == dest)
        return false;
    if (0 == src_length || 0 == dest_length)
        return false;
    if ((src_length & 1) != 0)
        return false;
    // The ranges are disjoint when one ends at or before the start of the other.
    if (!(dest >= src + src_length || src >= dest + dest_length))
        return false;
    if (check_dest_length && dest_length < calcUtf8StringLength(src, src_length, is_little_endian))
        return false;

    std::memset(dest, 0, dest_length);

    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(src);
    unsigned int offset = 0;
    unsigned long code_point = 0;
    for (unsigned int pos = 0; pos < src_length;)
    {
        if (!convert_method_detail::nextCodePoint(bytes, src_length, pos, is_little_endian, code_point))
            continue;

        const unsigned int count = convert_method_detail::utf8Length(code_point);
        if (dest_length - offset < count)
            return false;   // refuse to write past the end of dest

        convert_method_detail::encodeUtf8(code_point, dest + offset);
        offset += count;
    }
    return true;
}
测试程序:在VC新建一个工程把代码覆盖进去,并导入convertMethod.h
测试文件的路径需要更改一下,推荐用大文本测试,更能显示出性能的差异
测试文件只能是带BOM的UTF-16编码(即Windows记事本里所说的"Unicode"编码)文本文件
提供测试样品:bungakusyoujyo-unicode
#pragma once #include "stdafx.h" #include <string> #include <fstream> #include <iostream> #include <time.h> #include "convertMethod.h" using namespace std; int _tmain(int argc, _TCHAR* argv[]) { string filename = "../testfiles/utf-16/bungakusyoujyo-unicode.txt"; ifstream infile(filename.c_str(), ios::in|ios::binary); if (!infile) { cerr<<"unable to open input file!\n"; return -1; } ofstream outfile("../testfiles/out/bungakusyoujyo-utf-8.txt", ios::binary); if (!outfile) { cerr<<"unable to open output file\n"; infile.close(); return -1; } infile.seekg(0, ios::end); unsigned int rawLength = infile.tellg(); char *rawStringBuffer = new char[rawLength+1]; memset((void*)rawStringBuffer, 0, rawLength+1); infile.seekg(0, 0); infile.read(rawStringBuffer, rawLength); char *stringBuffer = rawStringBuffer + 2; unsigned int length = rawLength - 2; bool isLittleEndian=false, hasError=false; if (((unsigned char)rawStringBuffer[0]==0xFF)&&((unsigned char)rawStringBuffer[1]==0xFE)) isLittleEndian = true; else if (((unsigned char)rawStringBuffer[1]==0xFF)&&((unsigned char)rawStringBuffer[0]==0xFE)) isLittleEndian = false; else { hasError = true; } if (!hasError) { char UTF_8_BOM[3] = {'\xEF', '\xBB', '\xBF'}; outfile.write(UTF_8_BOM,3); if (match(stringBuffer, length)) { cout<<"Unicode matched!"<<endl; // output long beginTime, endTime; /*method 1*/ beginTime = clock(); string& resultBuffer1 = convert2utf8_pushback(stringBuffer, length, isLittleEndian); endTime = clock(); cout<<"utf-8 string length: "<<resultBuffer1.length()<<endl; cout<<"pushback method costs time: "<<endTime - beginTime<<"ms"<<endl; /*method 2*/ beginTime = clock(); string& resultBuffer2 = convert2utf8_copy(stringBuffer, length, isLittleEndian); endTime = clock(); cout<<"utf-8 string length: "<<resultBuffer2.length()<<endl; cout<<"copy construct method costs time: "<<endTime - beginTime<<"ms"<<endl; /*method 3*/ beginTime = clock(); string& resultBuffer3 = convert2utf8_allocate(stringBuffer, length, 
isLittleEndian); endTime = clock(); cout<<"utf-8 string length: "<<resultBuffer3.length()<<endl; cout<<"allocate method costs time: "<<endTime - beginTime<<"ms"<<endl; /*method 4*/ beginTime = clock(); unsigned int dst_length = calcUtf8StringLength(stringBuffer, length, isLittleEndian); char *dst = new char[dst_length]; bool result = convert2utf8(stringBuffer, length, dst, dst_length, isLittleEndian); endTime = clock(); if (!result) cerr<<"Fail to convert to utf-8\n"; else { cout<<"utf-8 string length: "<<dst_length<<endl; cout<<"memory copy method costs time: "<<endTime - beginTime<<"ms"<<endl; /*writing method 1*/ beginTime = clock(); outfile.write(dst, dst_length); endTime = clock(); cout<<"writing file by pointer costs time: "<<endTime - beginTime<<"ms"<<endl; } delete []dst; dst = NULL; /*writing method 2*/ /* const char *p = resultBuffer1.c_str(); unsigned int resultLength = resultBuffer1.length(); beginTime = clock(); outfile.write(p, resultLength); endTime = clock(); cout<<"writing file by pointer costs time: "<<endTime - beginTime<<"ms"<<endl; */ /*writing method 3*/ /* beginTime = clock(); outfile<<resultBuffer2; endTime = clock(); cout<<"writing file by stream costs time: "<<endTime - beginTime<<"ms"<<endl; */ } } infile.close(); outfile.close(); delete []rawStringBuffer; rawStringBuffer = NULL; stringBuffer = NULL; return 0; }
10W 行,普通文本没有这么BT吧….
Release和Debug的效率居然能反过来了
STL在Release下优化得那么好么
将文本复制粘贴超过10W行测试,否则Release下看不出差距