Compact String Dictionary

If system needs to process lots of short strings, which actually quite often are same, then using some dictionary to map strings into compact unique identifiers can improve memory usage and performance quite a bit.

[code lang=”java”]
package org.kari.test.string;

import gnu.trove.TObjectIntHashMap;

import java.util.ArrayList;
import java.util.List;

/**
* Dictionary mapping strings int compact identifiers
*
* @author kari
*/
public final class Dictionary {
public static final int NULL = 1;
/**
*

NOTE KI storage could be also something else. For example,
* data could stored using set of byte[] arrays of UTF-8 encoded data
*/
private final List mDictionary = new ArrayList();
private final TObjectIntHashMap mStrings = new TObjectIntHashMap();

/**
* 0 == not used (n/a value)
*/
private int mIndex;

public Dictionary() {
add(null);
add(””);
}

/**
* Add string into dictionary
*
* @return Compact shared Id of string
*/
public int add(String pStr) {
int key = mStrings.get(pStr);
if (key == 0) {
mIndex++;
key = mIndex;
// don’t leak memory via substring
String str = pStr != null ? new String(pStr) : null;
mDictionary.add(str);
mStrings.put(str, key);
}
return key;
}

/**
* @return String matching pKey, null if not found
*/
public String get(int pKey) {
if (pKey >= 0 && pKey <= mDictionary.size()) { return mDictionary.get(pKey - 1); } return null; } } [/code]

And quick test program for the dictionary
[code lang=”java”]
package org.kari.test.string;

public class DictionaryTest {
public static void main(String[] args) {
Dictionary dict = new Dictionary();

int[] keys = new int[] {
dict.add(””),
dict.add(null),
dict.add(”foobar”),
dict.add(”duh”),
dict.add(”foobar”),
};

for (int i = 0; i < keys.length; i++) { int key = keys[i]; System.out.println(key + "=[" + dict.get(key) + "]"); } } } [/code]

Output from test program looks like this
[code]
2=[]
1=[null]
3=[foobar]
4=[duh]
3=[foobar]
[/code]

Now what is point of whole this? Well, it could us to do, for example, following:
[code lang=”java”]
package org.kari.test.string;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
* Compact XML node based into Dictionary
*
* @author kari
*/
public final class XMLNode {
private static final XMLNode[] EMPTY = new XMLNode[0];

/**
* Dictionary is set in processing context and shared from there
*/
public static final ThreadLocal DICTIONARY = new ThreadLocal();

private int mName;
private int mText;

/**
* [attrId, attrValue, …]
*/
private int[] mAttributes;

/**
* Children, null value denotes missing child
*/
private XMLNode[] mNodes;

private XMLNode(String pName) {
setName(pName);
}

private XMLNode(int pName) {
setName(pName);
}

private Dictionary getDictionary() {
return DICTIONARY.get();
}

public String getName() {
return getDictionary().get(mName);
}

public void setName(String pName) {
mName = getDictionary().add(pName);
}

public void setName(int pName) {
mName = pName;
}

public String getText() {
return getDictionary().get(mText);
}

public void setText(String pText) {
mText = getDictionary().add(pText);
}

public void setText(int pText) {
mText = pText;
}

/**
* @param pAttrValue Value, null to remove attribute
*/
public void setAttribute(String pAttrName, String pAttrValue) {
setAttribute(getDictionary().add(pAttrName), pAttrValue);
}

/**
* @param pAttrValue Value, null to remove attribute
*/
public void setAttribute(int pAttrName, String pAttrValue) {
int value = getDictionary().add(pAttrValue);

if (mAttributes == null) {
mAttributes = new int[] {
pAttrName,
value
};
} else {
int index = -1;
int size = mAttributes.length;
for (int i = 0; i < size; i += 2) { int attr = mAttributes[i]; if (attr == pAttrName) { index = i; break; } } if (index != -1) { mAttributes[index + 1] = value; } else { int[] tmp = new int[size + 2]; System.arraycopy(mAttributes, 0, tmp, 0, size); tmp[size] = pAttrName; tmp[size + 1] = value; mAttributes = tmp; } } } /** * @return Attr value, null if not found */ public String getAttribute(String pAttrName) { return getAttribute(getDictionary().add(pAttrName)); } /** * @return Attr value, null if not found */ public String getAttribute(int pAttrName) { if (mAttributes != null) { int size = mAttributes.length; for (int i = 0; i < size; i += 2) { int attr = mAttributes[i]; if (attr == pAttrName) { int value = mAttributes[i + 1]; return getDictionary().get(value); } } } return null; } /** * @return All attribute names */ public List getAttributes() {
List result = null;
if (mAttributes != null) {
int size = mAttributes.length;
result = new ArrayList(size / 2);
Dictionary dict = getDictionary();
for (int i = 0; i < size; i += 2) { int attr = mAttributes[i]; int value = mAttributes[i + 1]; if (value != Dictionary.NULL) { result.add(dict.get(attr)); } } } return result != null ? result : Collections.emptyList();
}

/**
* @return All children of this, can contain null values (== missing child)
*/
public XMLNode[] getNodes() {
return mNodes != null
? mNodes
: EMPTY;
}

/**
* @param pNodes null to remove all nodes
*/
public void setNodes(List pNodes) {
if (pNodes != null && !pNodes.isEmpty()) {
int size = pNodes.size();
mNodes = new XMLNode[size];
for (int i = 0; i < size; i++) { mNodes[i] = pNodes.get(i); } } else { mNodes = null; } } public void addNode(XMLNode pNode) { if (mNodes == null) { mNodes = new XMLNode[] { pNode }; } else { int size = mNodes.length; XMLNode[] tmp = new XMLNode[size + 1]; System.arraycopy(mNodes, 0, tmp, 0, size); tmp[size] = pNode; mNodes = tmp; } } public void removeNode(XMLNode pNode) { if (mNodes != null) { int size = mNodes.length; for (int i = 0; i < size; i++) { if (mNodes[i] == pNode) { mNodes[i] = null; return; } } } } } [/code] This means that each XMLNode has base size of 24 bytes (in 64bit JVM it becomes 40 bytes). However, compact encoding could go even one step further, and treat XMLNodes as lightweights. Then, (a) each node would be assigned unique key (ex. negative number), which is from different range than attribute keys, (b) whole XMLNode is stored actually just an int[], which is stored into global map, (c) only unique lightweight id is kept for each node. Using different id spaces for nodes and attributes allows storing them in same int[] array, since based into id range they can be identified. And name and text, are simply stored into array indexes 0 and 1. Then access logic would be something like this: [code lang="java"] .... public static final ThreadLocal> NODE_DATA = new ThreadLocal>();

/**
*

NOTE Don’t store reference of returned node into any permanent
* collection
*
* @return Lightweight node instance for node manipulation
*/
public static XMLNode getNode(int pKey) {
int[] data = NODE_DATA.get().get(pKey);
return data != null
? new XMLNode(data)
: null;
}
…
[/code]
However, that goes a bit extreme. Based into quick calculation, nothing is saved in base size in 32bit JVM, but in 64bit JVM 16 bytes is saved. Actual savings happen actually only after some attributes or children are added, since object overhead of separate arrays is avoided.

References:
String Magic

12.1.2010 / java

ma	ti	ke	to	pe	la	su
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30

Compact String Dictionary

Vastaa Peruuta vastaus