{"id":254,"date":"2010-06-12T11:08:08","date_gmt":"2010-06-12T11:08:08","guid":{"rendered":"https:\/\/kari.world.ikari.fi\/2010\/06\/12\/converting-string-to-utf8\/"},"modified":"2010-06-12T11:08:08","modified_gmt":"2010-06-12T11:08:08","slug":"converting-string-to-utf8","status":"publish","type":"post","link":"https:\/\/kari.world.ikari.fi\/?p=254","title":{"rendered":"Converting String to UTF8?"},"content":{"rendered":"<p>Lets see what happens when trying to convert random string into UTF8 using different methods.<\/p>\n<p>[code lang=&#8221;java&#8221;]<br \/>\npackage org.kari.test.string;<\/p>\n<p>import java.io.IOException;<br \/>\nimport java.io.OutputStreamWriter;<br \/>\nimport java.nio.charset.Charset;<\/p>\n<p>import org.apache.log4j.Logger;<br \/>\nimport org.kari.log.LogUtil;<br \/>\nimport org.kari.util.DirectByteArrayOutputStream;<\/p>\n<p>\/**<br \/>\n * Test UTF8 conversion<br \/>\n *<br \/>\n * @author kari<br \/>\n *\/<br \/>\npublic class UTF8Test {<br \/>\n    public static final Logger LOG = LogUtil.getLogger(&#8221;utf8&#8221;);<\/p>\n<p>    private static final DirectByteArrayOutputStream mOutBuffer = new DirectByteArrayOutputStream(100000);<br \/>\n    private static OutputStreamWriter mWriter;<\/p>\n<p>    static final class ByteArrayReference {<br \/>\n        private byte[] mBuffer;<br \/>\n        private int mOffset;<br \/>\n        private int mLength;<\/p>\n<p>        public void set(byte[] pBuffer, int pOffset, int pLength) {<br \/>\n            mBuffer = pBuffer;<br \/>\n            mOffset = pOffset;<br \/>\n            mLength = pLength;<br \/>\n        }<\/p>\n<p>        public void clear() {<br \/>\n            mBuffer = null;<br \/>\n            mOffset = 0;<br \/>\n            mLength = 0;<br \/>\n        }<\/p>\n<p>        public byte[] getBuffer() {<br \/>\n            return mBuffer;<br \/>\n        }<\/p>\n<p>        public int getOffset() {<br \/>\n            return mOffset;<br \/>\n        }<\/p>\n<p>        public int 
getLength() {<br \/>\n            return mLength;<br \/>\n        }<\/p>\n<p>    }<\/p>\n<p>    public static abstract class Test {<br \/>\n        public abstract void convert(<br \/>\n            String pStr,<br \/>\n            ByteArrayReference pRef)<br \/>\n        throws IOException;<br \/>\n    }<\/p>\n<p>    public static final class WriterTest extends Test{<br \/>\n        @Override<br \/>\n        public void convert(<br \/>\n                String pStr,<br \/>\n                ByteArrayReference pRef)<br \/>\n            throws IOException<br \/>\n        {<br \/>\n            DirectByteArrayOutputStream out = mOutBuffer;<br \/>\n            OutputStreamWriter writer = mWriter;<\/p>\n<p>            out.reset();<br \/>\n            writer.write(pStr);<br \/>\n            writer.flush();<\/p>\n<p>            pRef.set(out.getBuffer(), 0, out.size());<\/p>\n<p>\/\/            System.out.print(&#8217;.&#8217;);<br \/>\n        }<br \/>\n    }<\/p>\n<p>    public static final class BasicTest extends Test {<br \/>\n        @Override<br \/>\n        public void convert(<br \/>\n                String pStr,<br \/>\n                ByteArrayReference pRef)<br \/>\n            throws IOException<br \/>\n        {<br \/>\n            byte[] data = pStr.getBytes(&#8221;UTF-8&#8221;);<br \/>\n            pRef.set(data, 0, data.length);<br \/>\n\/\/            System.out.print(&#8217;.&#8217;);<br \/>\n        }<br \/>\n    }<\/p>\n<p>    private UTF8Test() throws Exception {<br \/>\n        mWriter = new OutputStreamWriter(mOutBuffer, &#8221;UTF-8&#8221;);<br \/>\n    }<\/p>\n<p>    public ByteArrayReference test(String str, Test pTest)<br \/>\n        throws Exception<br \/>\n    {<br \/>\n        try {<br \/>\n            ByteArrayReference ref = new ByteArrayReference();<br \/>\n            System.out.println(&#8221;string len=&#8221; + str.length());<\/p>\n<p>            long startTime = System.nanoTime();<br \/>\n            int COUNT = 100;<br \/>\n            
for (int i = 0; i < COUNT; i++) {\n                ref.clear();\n                pTest.convert(str, ref);\n            }\n            long endTime = System.nanoTime();\n            long diff = endTime - startTime;\n\n            System.out.println();\n            System.out.println(\"  utf8 len=\" + ref.getLength());\n\n            System.out.println(\"total nano = \" + diff + \" nanos\");\n            System.out.println(\"  per nano = \" + (diff \/ (double)COUNT) + \" nanos\");\n\n            System.out.println(\"total time = \" + (diff \/ (1000.0 * 1000)) + \" ms\");\n            System.out.println(\"  per time = \" + ((diff \/ (1000.0 * 1000)) \/ (double)COUNT) + \" ms\");\n\n            return ref;\n        } catch (Exception e) {\n            throw e;\n        }\n    }\n\n    public static void main(String[] args) {\n        try {\n            test();\n        } catch (Exception e) {\n            LOG.error(\"Failed\", e);\n        }\n    }\n\n    private static void test()\n        throws Exception\n    {\n        UTF8Test test = new UTF8Test();\n        ByteArrayReference ref1;\n        ByteArrayReference ref2;\n        {\n            String str = createLongString();\n\n            System.out.println(\"====================\");\n            System.out.println(\"=====BASIC==========\");\n            System.out.println(\"====================\");\n            ref1 = test.test(str, new BasicTest());\n\n            System.out.println(\"====================\");\n            System.out.println(\"=====WRITER=========\");\n            System.out.println(\"====================\");\n            ref2 = test.test(str, new WriterTest());\n        }\n\n        System.out.println(\"====================\");\n        System.out.println(\"equal= \" + equals(ref1, ref2));\n    }\n\n    public static boolean equals(\n        ByteArrayReference ref1,\n        ByteArrayReference ref2)\n    {\n        boolean result = false;\n        result = ref1.getLength() == ref2.getLength();\n   
     if (result) {\n            byte[] buf1 = ref1.getBuffer();\n            byte[] buf2 = ref2.getBuffer();\n            int offset1 = ref1.getOffset();\n            int offset2 = ref2.getOffset();\n\n            for (int i = 0; result &#038;&#038; i < ref1.getLength(); i++) {\n                result = buf1[offset1 + i] == buf2[offset2 + i];\n            }\n        }\n        return result;\n    }\n\n    private static String createLongString() {\n        StringBuilder sb = new StringBuilder();\n        for (int i = 0; i < 1000 * 1000; i++) {\n            char ch = (char)(32 + (60000 * Math.random()));\n            sb.append(ch);\n        }\n        return sb.toString();\n    }\n}\n[\/code]\n\n\n\n<p>\nThe test was run with the following memory settings on Sun Java 1.6.0_20-b02 (32-bit):<\/p>\n<p>[code]<br \/>\n-Xms100M -Xmx400M<br \/>\n[\/code]<\/p>\n<p>\nAnd the results are:<br \/>\n[code]<br \/>\n====================<br \/>\n=====BASIC==========<br \/>\n====================<br \/>\nstring len=1000000<\/p>\n<p>  utf8 len=2897030<br \/>\ntotal nano = 3675792784 nanos<br \/>\n  per nano = 3.675792784E7 nanos<br \/>\ntotal time = 3675.792784 ms<br \/>\n  per time = 36.75792784 ms<br \/>\n====================<br \/>\n=====WRITER=========<br \/>\n====================<br \/>\nstring len=1000000<\/p>\n<p>  utf8 len=2897030<br \/>\ntotal nano = 3252002400 nanos<br \/>\n  per nano = 3.2520024E7 nanos<br \/>\ntotal time = 3252.0024 ms<br \/>\n  per time = 32.520024 ms<br \/>\n====================<br \/>\nequal= true<br \/>\n[\/code]<\/p>\n<p>It seems that using Writer for conversion is slightly faster in this test run. However, in real life I believe the difference can be even greater due to the memory thrashing that String.getBytes() causes.<\/p>\n<p>\nA faster approach could be to extract the Encoder from the UTF_8 class (i.e. re-implement it). 
The caveat of such an approach is naturally that a re-implementation can easily introduce subtle bugs into the logic, since most of the internal character set encoding logic must be duplicated in order to do so.<\/p>\n<p>\n<strong>References:<\/strong><br \/>\n<a href=\"http:\/\/blog.rapleaf.com\/dev\/2010\/04\/26\/faster-string-to-utf-8-encoding-in-java\/\">Faster string to UTF-8 encoding in Java<\/a><br \/>\n<a href=\"http:\/\/stackoverflow.com\/questions\/2098137\/fast-alternative-to-java-nio-charset-charset-decode-encode\">Fast alternative to java.nio.charset.Charset.decode(..)\/encode(..)<\/a><\/p>\n<p>\n<strong>Update: 16.6.2010<\/strong><br \/>\nFor completeness, I also tried what happens if a CharsetEncoder is used (via Charset.encode()):<br \/>\n[code lang=\"java\"]<br \/>\nCharset cs = Charset.forName(\"UTF-8\");<br \/>\nByteBuffer data = cs.encode(pStr);<br \/>\n[\/code]<\/p>\n<p>The net result is that this is much slower (over 50% slower) than String.getBytes(). The main reason for the slowness is likely that this API cannot use the optimized logic in String, which allows direct char[]\u00a0access to the original character data.<\/p>\n<p>Notice 1:<br \/>\nIt seems that the speed of OutputStreamWriter comes at a cost. The logic inside OSW uses StreamEncoder, which allocates a temporary char[] for the *whole* string contents in order to copy chars from the String for fast access; if strings are large, this can cause problems (!).<\/p>\n<p>Notice 2:<br \/>\nWhen changing the test to use 100-char strings with 1 million iterations, it turned out that String.getBytes() was practically as fast as Writer (or faster, depending on whether an extra gc() due to the allocation in StreamEncoder is hit or not).<\/p>\n<p>Notice 3:<br \/>\nOn my not-so-new hardware, I got String encoding speeds between 54M\/s for plain ASCII\/Latin-1 chars (random characters in range 32 &#8211; 255), and 18M\/s when &#8220;high&#8221; Unicode chars were included (random characters in range 32 &#8211; 60000). 
Not stellar performance, but what is noteworthy is that for non-western users, speed is less than 50% (and encoded byte[] storage is tripled), so that needs to be taken into account when trying to &#8220;optimize&#8221; strings.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Lets see what happens when trying to convert random string into UTF8 using different methods. [code lang=&#8221;java&#8221;] package org.kari.test.string; import java.io.IOException; import java.io.OutputStreamWriter; import java.nio.charset.Charset; import org.apache.log4j.Logger; import org.kari.log.LogUtil; import org.kari.util.DirectByteArrayOutputStream; \/** * Test UTF8 conversion * * @author kari *\/ public class UTF8Test { public static final Logger LOG = LogUtil.getLogger(&#8221;utf8&#8221;); private static final&#8230;<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[7],"tags":[],"class_list":["post-254","post","type-post","status-publish","format-standard","hentry","category-java"],"_links":{"self":[{"href":"https:\/\/kari.world.ikari.fi\/index.php?rest_route=\/wp\/v2\/posts\/254","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/kari.world.ikari.fi\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/kari.world.ikari.fi\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/kari.world.ikari.fi\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/kari.world.ikari.fi\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=254"}],"version-history":[{"count":0,"href":"https:\/\/kari.world.ikari.fi\/index.php?rest_route=\/wp\/v2\/posts\/254\/revisions"}],"wp:attachment":[{"href":"https:\/\/kari.world.ikari.fi\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=254"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/kari.world.i
kari.fi\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=254"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/kari.world.ikari.fi\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=254"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}