I've recently had a problem with importing UTF-8 CSV file into the MSSQL 2012 db using bulk insert, as MSSQL, in fact, does not support UTF-8.
In order to optimize the conversion process, the conversion buffer size can be changed.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | /// <summary> /// Converts UTF-8 encoded file to UTF-16 (Unicode) /// </summary> /// <param name="utf8FilePath">Path to the input file (UTF-8 encoded)</param> /// <param name="unicodeFilePath">Path to the output file (UTF-16 encoded)</param> /// <param name="bufferSize">Size of the buffer in Kb</param> public static void ConvertUtfToUnicode(String utf8FilePath, String unicodeFilePath, int bufferSize) { FileStream fs = new FileStream(utf8FilePath, FileMode.Open, FileAccess.Read, FileShare.None, 1, true); FileStream fs_out = new FileStream(unicodeFilePath, FileMode.Create, FileAccess.Write, FileShare.None,1,true); int total = (int)fs.Length; int cBytesOut = 0, cBytesIn = 0, cExtraBytes = 0, cLength = 0; int piece = bufferSize * 1024; byte[] bytes_in = new byte[piece+3]; byte[] bytes_out = null; do { cLength = fs.Read(bytes_in, cExtraBytes, piece) + cExtraBytes; cExtraBytes = cntExtraBytes(bytes_in[cLength - 3], bytes_in[cLength - 2], bytes_in[cLength - 1]); bytes_out = Encoding.Convert(Encoding.UTF8, Encoding.Unicode, bytes_in, 0, cLength - cExtraBytes); if (cExtraBytes > 0) { Buffer.BlockCopy(bytes_in, cLength - cExtraBytes, bytes_in, 0, cExtraBytes); } fs_out.Write(bytes_out, 0, bytes_out.Count()); cBytesOut += bytes_out.Count(); cBytesIn += piece; } while (total > cBytesIn); fs.Close(); fs_out.Close(); } // UTF-8 is encoded in up to 4 bytes, need to make sure we don't chop a sequence in half // 240+: First in a 4 byte sequence // 224+: 3 byte sequence // 194+: 2 byte sequence // 128+: Part of a multi-byte sequence // 128-: Not part of a multi-byte sequence private static int cntExtraBytes(byte c3, byte c2, byte c1) { if (c1 < 128) return 0; if (c1 >= 192) return 1; if (c2 >= 224) return 2; if (c2 >= 192) return 0; if (c3 >= 240) return 3; if (c3 >= 224) return 0; return 0; } |
btw, how do you like my code beautifier? :) you can make you code beautiful too, just navigate to http://hilite.me/
No comments:
Post a Comment