String intern
Several simple code snippets
// Sample1
// Build the string from a char array, so no "hello" literal appears in this snippet.
char[] chars = {'h', 'e', 'l', 'l', 'o'};
String str1 = new String(chars);
// Compare the reference held by str1 with the reference returned by intern().
System.out.println(str1 == str1.intern());
print: true
// Sample2
// Here the string is constructed from the literal "hello" instead of a char array.
String str1 = new String("hello");
// Compare the reference held by str1 with the reference returned by intern().
System.out.println(str1 == str1.intern());
print: false
// Sample3
String a = "hello"; // plain string literal
String b = new String("hello"); // explicit construction from the same literal
// NOTE(review): the concatenation of literals below is presumably folded into the
// single constant "hello" at compile time — confirm against the Java Language Specification.
String c = new String("h" + "e" + "l" + "l" + "o");
String d = b.intern(); // asks for the interned instance whose contents equal b
How many String objects are created? The answer is 3. Did you get it right?
questions
- When and where is the String object for a string literal created, even though we don’t explicitly instantiate it?
- What happens when intern() is called?
- How is the “pool of strings” implemented?
- What’s the difference between the two constructors String(char[]) and String(String)?
constant string loading and intern
As we know, a string literal is compiled into the constant pool of the class file, and the bytecode that loads it is ldc.
public class StringLiteral {
public StringLiteral() {
String str = "hello"; // the only string literal in this class; its loading is what the bytecode listing illustrates
}
}
will be compiled to:
Constant pool:
#1 = Methodref #4.#16 // java/lang/Object."<init>":()V
#2 = String #17 // hello
#3 = Class #18 // StringLiteral
#4 = Class #19 // java/lang/Object
#5 = Utf8 <init>
#6 = Utf8 ()V
#7 = Utf8 Code
#8 = Utf8 LineNumberTable
#9 = Utf8 LocalVariableTable
#10 = Utf8 this
#11 = Utf8 LStringLiteral;
#12 = Utf8 str
#13 = Utf8 Ljava/lang/String;
#14 = Utf8 SourceFile
#15 = Utf8 StringLiteral.java
#16 = NameAndType #5:#6 // "<init>":()V
#17 = Utf8 hello
#18 = Utf8 StringLiteral
#19 = Utf8 java/lang/Object
{
public StringLiteral();
flags: ACC_PUBLIC
Code:
stack=1, locals=2, args_size=1
0: aload_0
1: invokespecial #1 // Method java/lang/Object."<init>":()V
4: ldc #2 // String hello
6: astore_1
7: return
LineNumberTable:
line 5: 0
line 6: 4
line 7: 7
LocalVariableTable:
Start Length Slot Name Signature
0 8 0 this LStringLiteral;
7 1 1 str Ljava/lang/String;
}
When ldc is executed, it must first resolve the symbol to the actual reference if it has not been resolved yet.
NOTICE:
4: ldc #2 // String hello
6: astore_1
will find #2 in constant pool:
#2 = String #17 // hello
#17 = Utf8 hello
Initially, #2 is a Constant_String_info symbol, and it is associated with #17, which is a Constant_Utf8_info (a UTF-8 encoded char array).
// line2111-2161 share/vm/interpreter/bytecodeInterpreter.cpp
// Interpreter handler for the load-constant bytecodes (excerpt: the handling of
// the other constant tags is elided with "....").
CASE(_ldc):
{
u2 index;
bool wide = false;
int incr = 2; // frequent case
// Plain ldc: the constant pool index is the single byte at pc[1].
// Otherwise the index is a two-byte value read from pc+1, and incr/wide are
// adjusted accordingly (incr is presumably the instruction length — confirm
// against the code following this excerpt).
if (opcode == Bytecodes::_ldc) {
index = pc[1];
} else {
index = Bytes::get_Java_u2(pc+1);
incr = 3;
wide = true;
}
ConstantPool* constants = METHOD->constants();
// Dispatch on the tag of the referenced constant pool entry.
switch (constants->tag_at(index).value()) {
....
case JVM_CONSTANT_String:
{
// Look for an already-resolved String oop in the pool's resolved_references array.
oop result = constants->resolved_references()->obj_at(index);
if (result == NULL) {
// Not resolved yet: call into the runtime. The resolved oop comes back through
// the thread's vm_result, is stored to the stack, and vm_result is then cleared.
CALL_VM(InterpreterRuntime::resolve_ldc(THREAD, (Bytecodes::Code) opcode), handle_exception);
SET_STACK_OBJECT(THREAD->vm_result(), 0);
THREAD->set_vm_result(NULL);
} else {
// Already resolved: store the cached oop to the stack directly.
VERIFY_OOP(result);
SET_STACK_OBJECT(result, 0);
}
break;
}
....
}
When the tag is JVM_CONSTANT_String, the JVM first checks the resolved reference with oop result = constants->resolved_references()->obj_at(index);. If it is not resolved yet, it calls InterpreterRuntime::resolve_ldc(THREAD, (Bytecodes::Code) opcode).
// line125-142 share/vm/interpreter/interpreterRuntime.cpp
// Runtime entry that resolves the constant referenced by the load-constant
// bytecode at the thread's current bci and hands the resulting oop back to the
// caller through thread->set_vm_result().
IRT_ENTRY(void, InterpreterRuntime::resolve_ldc(JavaThread* thread, Bytecodes::Code bytecode)) {
assert(bytecode == Bytecodes::_fast_aldc ||
bytecode == Bytecodes::_fast_aldc_w, "wrong bc");
ResourceMark rm(thread);
methodHandle m (thread, method(thread));
// Wrap the bytecode at the current bci and let the wrapper resolve its constant.
Bytecode_loadconstant ldc(m, bci(thread));
oop result = ldc.resolve_constant(CHECK);
#ifdef ASSERT
{
// The bytecode wrappers aren't GC-safe so construct a new one
Bytecode_loadconstant ldc2(m, bci(thread));
// Debug-only check: the entry stored in resolved_references must be the same
// oop that resolution just returned.
oop coop = m->constants()->resolved_references()->obj_at(ldc2.cache_index());
assert(result == coop, "expected result for assembly code");
}
#endif
thread->set_vm_result(result);
}
IRT_END
Here, it then calls Bytecode_loadconstant::resolve_constant(..).
// line217-226 share/vm/interpreter/bytecode.cpp
// Resolves the constant this load-constant bytecode refers to and returns its oop.
// The raw index is handed to the cached resolution path when has_cache_index()
// is true, and to the plain constant pool resolution path otherwise.
oop Bytecode_loadconstant::resolve_constant(TRAPS) const {
assert(_method.not_null(), "must supply method to resolve constant");
int index = raw_index();
ConstantPool* constants = _method->constants();
if (has_cache_index()) {
return constants->resolve_cached_constant_at(index, THREAD);
} else {
return constants->resolve_constant_at(index, THREAD);
}
}
Here, there is no cache index yet, so it calls constants->resolve_constant_at(index, THREAD).
// line705-709 share/vm/oops/constantPool.hpp
// Resolve late bound constants.
// Wraps this constant pool in a handle and delegates to resolve_constant_at_impl,
// passing _no_index_sentinel as the cache index.
oop resolve_constant_at(int index, TRAPS) {
constantPoolHandle h_this(THREAD, this);
return resolve_constant_at_impl(h_this, index, _no_index_sentinel, THREAD);
}
// line614-718 share/vm/oops/constantPool.cpp
// Called to resolve constants in the constant pool and return an oop.
// Some constant pool entries cache their resolved oop. This is also
// called to create oops from constants to use in arguments for invokedynamic
//
// this_oop    - handle to the constant pool being resolved
// index       - constant pool index of the entry
// cache_index - index into resolved_references, or one of the sentinels
//               _no_index_sentinel / _possible_index_sentinel
// Returns the resolved oop. (Excerpt: the cases for other tags are elided
// with "....".)
oop ConstantPool::resolve_constant_at_impl(constantPoolHandle this_oop, int index, int cache_index, TRAPS) {
oop result_oop = NULL;
Handle throw_exception;
if (cache_index == _possible_index_sentinel) {
// It is possible that this constant is one which is cached in the objects.
// We'll do a linear search. This should be OK because this usage is rare.
assert(index > 0, "valid index");
cache_index = this_oop->cp_to_object_index(index);
}
assert(cache_index == _no_index_sentinel || cache_index >= 0, "");
assert(index == _no_index_sentinel || index >= 0, "");
// Fast path: with a real cache index, return the cached oop if one is already
// stored; otherwise map the cache index back to the constant pool index.
if (cache_index >= 0) {
result_oop = this_oop->resolved_references()->obj_at(cache_index);
if (result_oop != NULL) {
return result_oop;
// That was easy...
}
index = this_oop->object_to_cp_index(cache_index);
}
jvalue prim_value; // temp used only in a few cases below
// Slow path: resolve according to the tag of the entry.
int tag_value = this_oop->tag_at(index).value();
switch (tag_value) {
....
case JVM_CONSTANT_String:
assert(cache_index != _no_index_sentinel, "should have been set");
// An entry flagged as a pseudo-string is fetched via pseudo_string_at;
// every other String entry is resolved through string_at_impl.
if (this_oop->is_pseudo_string_at(index)) {
result_oop = this_oop->pseudo_string_at(index, cache_index);
break;
}
result_oop = string_at_impl(this_oop, index, cache_index, CHECK_NULL);
break;
....
}
if (cache_index >= 0) {
// Cache the oop here also.
Handle result_handle(THREAD, result_oop);
MonitorLockerEx ml(this_oop->lock()); // don't know if we really need this
// Re-read the slot while holding the lock.
oop result = this_oop->resolved_references()->obj_at(cache_index);
// Benign race condition: resolved_references may already be filled in while we were trying to lock.
// The important thing here is that all threads pick up the same result.
// It doesn't matter which racing thread wins, as long as only one
// result is used by all threads, and all future queries.
// That result may be either a resolved constant or a failure exception.
if (result == NULL) {
// Slot still empty: publish our result and return it.
this_oop->resolved_references()->obj_at_put(cache_index, result_handle());
return result_handle();
} else {
// Return the winning thread's result. This can be different than
// result_handle() for MethodHandles.
return result;
}
} else {
// No cache slot for this entry: return the freshly resolved oop without caching.
return result_oop;
}
}
It then calls string_at_impl.
// line816-825 share/vm/oops/constantPool.cpp
// Returns the String oop for constant pool entry `which`, interning it on first use.
//   which     - constant pool index of the String entry
//   obj_index - index of the entry's slot in resolved_references
oop ConstantPool::string_at_impl(constantPoolHandle this_oop, int which, int obj_index, TRAPS) {
// If the string has already been interned, this entry will be non-null
oop str = this_oop->resolved_references()->obj_at(obj_index);
if (str != NULL) return str;
// Not resolved yet: take the entry's unresolved Symbol and intern it through
// the StringTable, then record the resulting oop for this entry.
Symbol* sym = this_oop->unresolved_string_at(which);
str = StringTable::intern(sym, CHECK_(NULL));
this_oop->string_at_put(which, obj_index, str);
assert(java_lang_String::is_instance(str), "must be string");
return str;
}
Finally, we find these two lines:
Symbol* sym = this_oop->unresolved_string_at(which);
str = StringTable::intern(sym, CHECK_(NULL));
Here sym is the Utf8 entry that the String entry is associated with.
// vm/classfile/symbolTable.cpp
// Interns the string denoted by a Symbol and returns the interned oop
// (NULL when symbol is NULL).
oop StringTable::intern(Symbol* symbol, TRAPS) {
if (symbol == NULL) return NULL;
ResourceMark rm(THREAD);
int length;
// Convert the symbol into a jchar array; length receives the number of chars.
jchar* chars = symbol->as_unicode(length);
// No existing String object is supplied: `string` is left default-constructed,
// so the callee takes its "create a new String" branch on a table miss.
Handle string;
oop result = intern(string, chars, length, CHECK_NULL);
return result;
}
// Core interning routine.
//   string_or_null - an existing String object to insert on a miss, or a null handle
//   name, len      - the characters of the string
// Returns the oop found in the table, or the oop returned by basic_add after
// attempting the insertion.
oop StringTable::intern(Handle string_or_null, jchar* name,
int len, TRAPS) {
// First lookup; it happens before StringTable_lock is acquired below.
unsigned int hashValue = hash_string(name, len);
int index = the_table()->hash_to_index(hashValue);
oop found_string = the_table()->lookup(index, name, len, hashValue);
// Found
if (found_string != NULL) return found_string;
debug_only(StableMemoryChecker smc(name, len * sizeof(name[0])));
assert(!Universe::heap()->is_in_reserved(name),
"proposed name of symbol must be stable");
Handle string;
// try to reuse the string if possible
if (!string_or_null.is_null()) {
string = string_or_null;
} else {
// No String object was supplied: create one from the characters.
string = java_lang_String::create_from_unicode(name, len, CHECK_NULL);
}
// Grab the StringTable_lock before getting the_table() because it could
// change at safepoint.
MutexLocker ml(StringTable_lock, THREAD);
// Otherwise, add to symbol to table
return the_table()->basic_add(index, string, name, len,
hashValue, CHECK_NULL);
}
// Adds `string` to the table unless an equal entry is found by the second
// lookup below. Returns the oop that ends up representing these characters:
// the existing entry if there is one, otherwise `string` itself.
//   index_arg, hashValue_arg - bucket index and hash computed by the caller
//   name, len                - the characters of the string
oop StringTable::basic_add(int index_arg, Handle string, jchar* name,
int len, unsigned int hashValue_arg, TRAPS) {
assert(java_lang_String::equals(string(), name, len),
"string must be properly initialized");
// Cannot hit a safepoint in this function because the "this" pointer can move.
No_Safepoint_Verifier nsv;
// Check if the symbol table has been rehashed, if so, need to recalculate
// the hash value and index before second lookup.
unsigned int hashValue;
int index;
if (use_alternate_hashcode()) {
hashValue = hash_string(name, len);
index = hash_to_index(hashValue);
} else {
// Otherwise reuse the hash value and index the caller already computed.
hashValue = hashValue_arg;
index = index_arg;
}
// Since look-up was done lock-free, we need to check if another
// thread beat us in the race to insert the symbol.
oop test = lookup(index, name, len, hashValue); // calls lookup(u1*, int)
if (test != NULL) {
// Entry already added
return test;
}
// Still absent: create an entry for the String oop and link it into the bucket.
HashtableEntry<oop, mtSymbol>* entry = new_entry(hashValue, string());
add_entry(index, entry);
return string();
}
First, it converts the Utf8 symbol to a Java char array. Then it uses this char array to create a String object via java_lang_String::create_from_unicode(name, len, CHECK_NULL);. Finally, it adds the oop of this String to the table and returns it. Here the_table is the StringTable, and StringTable is just a Hashtable<oop, mtSymbol>.
// line185-192 share/vm/classfile/javaClasses.cpp
// Creates a String from `length` jchars starting at `unicode` and returns a
// handle to it.
Handle java_lang_String::create_from_unicode(jchar* unicode, int length, TRAPS) {
// Obtain a String object for `length` characters from basic_create, then fetch
// its value array.
Handle h_obj = basic_create(length, CHECK_NH);
typeArrayOop buffer = value(h_obj());
// Copy the characters into the value array one by one.
for (int index = 0; index < length; index++) {
buffer->char_at_put(index, unicode[index]);
}
return h_obj;
}
explicitly call String#intern()
This is the JNI method of String#intern():
// share/native/java/lang/String.c
/* Native implementation of String#intern(): forwards the receiver straight to
 * the VM entry point JVM_InternString and returns its result. */
JNIEXPORT jobject JNICALL
Java_java_lang_String_intern(JNIEnv *env, jobject this)
{
return JVM_InternString(env, this);
}
// share/vm/prims/jvm.cpp
// VM entry point behind String#intern(). Returns NULL for a NULL argument;
// otherwise interns the string through the StringTable and returns the result
// as a local JNI handle.
JVM_ENTRY(jstring, JVM_InternString(JNIEnv *env, jstring str))
JVMWrapper("JVM_InternString");
JvmtiVMObjectAllocEventCollector oam;
if (str == NULL) return NULL;
// Turn the JNI handle into an oop, intern it, and wrap the result in a new
// local handle for the caller.
oop string = JNIHandles::resolve_non_null(str);
oop result = StringTable::intern(string, CHECK_NULL);
return (jstring) JNIHandles::make_local(env, result);
JVM_END
As you can see from oop result = StringTable::intern(string, CHECK_NULL);, it goes through StringTable::intern, just like the code above that loads a constant string.
visualization
StringTable has a static field _the_table, which can be accessed directly through StringTable::the_table(). So this is actually an internal data structure that belongs to the JVM itself. It is created globally in the universe, so it makes no sense to say that it is located in the method area.