引言
在Java虚拟机(JVM)中,方法调用是一个复杂而精巧的过程。本文将通过分析OpenJDK 17的C++源码,深入探讨 `aload_1` 和 `invokevirtual` 指令的执行机制,揭示JVM方法调用的内部工作原理。
aload_1指令:局部变量访问的优化
`aload_1` 是Java字节码中用于加载局部变量的指令,它是 `aload` 指令家族的优化形式:
java
// Java字节码示例
7: astore_1 // 将对象引用存储到局部变量1
8: aload_1 // 从局部变量1加载对象引用
在OpenJDK的模板解释器中,`aload_1` 由以下代码处理:
cpp
void TemplateTable::aload(int n) {
transition(vtos, atos); // 状态转换:从无类型到引用类型
__ movptr(rax, aaddress(n)); // 将局部变量n的值加载到rax寄存器
}
这里的关键是:
- `aaddress(n)` 计算局部变量表索引 n 的地址
- 索引值 1 是硬编码在指令中的,不需要额外操作数
- 加载的对象引用存入 rax 寄存器,相当于压入操作数栈顶
invokevirtual指令:虚方法调用的核心
`invokevirtual` 指令负责虚拟方法分发,其执行过程更为复杂。让我们分析关键代码:
prepare_invoke:方法调用准备
cpp
void TemplateTable::invokevirtual(int byte_no) {
transition(vtos, vtos);
prepare_invoke(byte_no, rbx, noreg, rcx, rdx); // recv, flags
invokevirtual_helper(rbx, rcx, rdx);
}
`prepare_invoke` 函数是方法调用的核心准备阶段:
cpp
void TemplateTable::prepare_invoke(int byte_no,
Register method,
Register index,
Register recv,
Register flags) {
// ... 初始化处理
if (recv == noreg) recv = rcx; // 确保接收者使用rcx寄存器
// 加载常量池缓存条目
load_invoke_cp_cache_entry(byte_no, method, index, flags, is_invokevirtual, false, is_invokedynamic);
// 加载接收者对象
if (load_receiver) {
__ movl(recv, flags);
__ andl(recv, ConstantPoolCacheEntry::parameter_size_mask);
const int no_return_pc_pushed_yet = -1;
const int receiver_is_at_end = -1;
Address recv_addr = __ argument_address(recv, no_return_pc_pushed_yet + receiver_is_at_end);
__ movptr(recv, recv_addr); // 关键:从栈加载接收者到rcx
__ verify_oop(recv);
}
// ... 后续处理
}
这里的关键操作是 `__ argument_address` 计算接收者对象在调用者操作数栈上的地址,然后通过 `movptr` 将其加载到 rcx 寄存器。
invokevirtual_helper:虚方法分发
cpp
void TemplateTable::invokevirtual_helper(Register index,
Register recv,
Register flags) {
// 检查是否为final方法
Label notFinal;
__ movl(rax, flags);
__ andl(rax, (1 << ConstantPoolCacheEntry::is_vfinal_shift));
__ jcc(Assembler::zero, notFinal);
// 处理final方法
__ null_check(recv);
__ profile_final_call(rax);
__ jump_from_interpreted(method, rax);
__ bind(notFinal);
// 处理虚方法分发
__ null_check(recv, oopDesc::klass_offset_in_bytes());
__ load_klass(rax, recv, tmp_load_klass); // 加载接收者的类对象
__ profile_virtual_call(rax, rlocals, rdx);
__ lookup_virtual_method(rax, index, method); // 查找虚方法
__ jump_from_interpreted(method, rdx); // 跳转到方法入口
}
参数传递机制
JVM使用栈式架构进行参数传递,这是一个精巧的设计:
调用者操作数栈布局
text
[参数N] ← 栈顶(最后压入的参数)
...
[参数2]
[参数1]
[接收者对象] ← 第一个隐含参数(this引用)
参数访问机制
在方法入口处,解释器通过以下代码建立参数访问基础:
cpp
// 在generate_normal_entry中:
__ lea(rlocals, Address(rsp, rcx, Interpreter::stackElementScale(), -wordSize));
这里:
- rcx 存储参数数量(不是接收者对象)
- 计算结果 rlocals 指向局部变量表的起始地址
- 参数通过计算好的偏移量直接访问,无需显式复制
实际案例解析
考虑以下Java代码的字节码:
java
public static void main(String[] args) {
ByteCodeTest byteCodeTest = new ByteCodeTest();
byteCodeTest.test();
}
// 对应字节码:
7: astore_1 // 存储对象引用到局部变量1
8: aload_1 // 加载对象引用到操作数栈
9: invokevirtual #7 // 调用test方法
执行流程:
1. `astore_1` 将对象引用存入局部变量表索引1
2. `aload_1` 从局部变量表加载该引用到操作数栈顶
3. `invokevirtual` 使用栈顶对象作为接收者,调用 `test` 方法
4. 在 `prepare_invoke` 中,接收者对象从栈加载到 rcx 寄存器
5. 通过虚方法分发找到实际的 `test` 方法实现
6. 跳转到方法入口执行
总结
JVM的方法调用机制体现了栈式架构的精妙设计:
- 局部变量访问优化:`aload_1` 等指令通过硬编码索引提高效率
- 参数传递机制:参数按约定顺序排列在操作数栈上,通过地址计算直接访问
- 虚方法分发:基于接收者对象的实际类型进行方法查找
- 性能优化:通过内联缓存、方法内联等技术提升执行效率
通过深入分析OpenJDK源码,我们不仅理解了JVM方法调用的内部机制,还能更好地理解Java程序的运行时行为,为性能优化和问题排查提供坚实基础。
这种精巧的设计使得Java能够在保持面向对象特性的同时,提供接近本地代码的执行效率,体现了JVM作为高级语言运行时环境的成熟与精巧。
## 源码
cpp
// Template for the invokevirtual bytecode. prepare_invoke resolves the call
// site through the constant pool cache (filling rbx/rcx/rdx), then
// invokevirtual_helper performs the actual dispatch (direct call for vfinal
// methods, vtable lookup otherwise).
void TemplateTable::invokevirtual(int byte_no) {
  // tty->print("TemplateTable::invokevirtual: ");
  transition(vtos, vtos);
  assert(byte_no == f2_byte, "use this argument");
  prepare_invoke(byte_no,
                 rbx,        // method or vtable index
                 noreg,      // unused itable index
                 rcx, rdx);  // recv, flags

  // rbx: index
  // rcx: receiver
  // rdx: flags
  invokevirtual_helper(rbx, rcx, rdx);
}
// Common call-site setup shared by all invoke* bytecodes:
//  - resolves the constant pool cache entry into method/index/flags,
//  - optionally pushes the appendix argument (invokedynamic/invokehandle),
//  - loads the receiver oop from the caller's operand stack (when requested),
//  - pushes the bytecode-specific return address on top of the arguments.
// Register constraints are asserted below: recv must be rcx, flags must be rdx.
void TemplateTable::prepare_invoke(int byte_no,
                                   Register method,  // linked method (or i-klass)
                                   Register index,   // itable index, MethodType, etc.
                                   Register recv,    // if caller wants to see it
                                   Register flags    // if caller wants to test it
                                   ) {
  // determine flags
  const Bytecodes::Code code = bytecode();
  const bool is_invokeinterface = code == Bytecodes::_invokeinterface;
  const bool is_invokedynamic = code == Bytecodes::_invokedynamic;
  const bool is_invokehandle = code == Bytecodes::_invokehandle;
  const bool is_invokevirtual = code == Bytecodes::_invokevirtual;
  const bool is_invokespecial = code == Bytecodes::_invokespecial;
  const bool load_receiver = (recv != noreg);
  const bool save_flags = (flags != noreg);
  assert(load_receiver == (code != Bytecodes::_invokestatic && code != Bytecodes::_invokedynamic), "");
  assert(save_flags == (is_invokeinterface || is_invokevirtual), "need flags for vfinal");
  assert(flags == noreg || flags == rdx, "");
  assert(recv == noreg || recv == rcx, "");

  // setup registers & access constant pool cache
  if (recv == noreg) recv = rcx;
  if (flags == noreg) flags = rdx;
  assert_different_registers(method, index, recv, flags);

  // save 'interpreter return address'
  __ save_bcp();

  load_invoke_cp_cache_entry(byte_no, method, index, flags, is_invokevirtual, false, is_invokedynamic);

  // maybe push appendix to arguments (just before return address)
  if (is_invokedynamic || is_invokehandle) {
    Label L_no_push;
    __ testl(flags, (1 << ConstantPoolCacheEntry::has_appendix_shift));
    __ jcc(Assembler::zero, L_no_push);
    // Push the appendix as a trailing parameter.
    // This must be done before we get the receiver,
    // since the parameter_size includes it.
    __ push(rbx);
    __ mov(rbx, index);
    __ load_resolved_reference_at_index(index, rbx);
    __ pop(rbx);
    __ push(index);  // push appendix (MethodType, CallSite, etc.)
    __ bind(L_no_push);
  }

  // load receiver if needed (after appendix is pushed so parameter size is correct)
  // Note: no return address pushed yet
  if (load_receiver) {
    __ movl(recv, flags);
    // recv := number of parameter slots, extracted from the flags word
    __ andl(recv, ConstantPoolCacheEntry::parameter_size_mask);
    const int no_return_pc_pushed_yet = -1;  // argument slot correction before we push return address
    const int receiver_is_at_end      = -1;  // back off one slot to get receiver
    Address recv_addr = __ argument_address(recv, no_return_pc_pushed_yet + receiver_is_at_end);
    __ movptr(recv, recv_addr);  // key step: load the receiver oop from the stack into rcx
    __ verify_oop(recv);
  }

  if (save_flags) {
    // Stash flags in rbcp; the real bcp was saved by save_bcp() above and is
    // restored by restore_bcp() at the end.
    __ movl(rbcp, flags);
  }

  // compute return type
  __ shrl(flags, ConstantPoolCacheEntry::tos_state_shift);
  // Make sure we don't need to mask flags after the above shift
  ConstantPoolCacheEntry::verify_tos_state_shift();
  // load return address (indexed by the tos state computed above)
  {
    const address table_addr = (address) Interpreter::invoke_return_entry_table_for(code);
    ExternalAddress table(table_addr);
    LP64_ONLY(__ lea(rscratch1, table));
    LP64_ONLY(__ movptr(flags, Address(rscratch1, flags, Address::times_ptr)));
    NOT_LP64(__ movptr(flags, ArrayAddress(table, Address(noreg, flags, Address::times_ptr))));
  }

  // push return address
  __ push(flags);

  // Restore flags value from the constant pool cache, and restore rsi
  // for later null checks. r13 is the bytecode pointer
  if (save_flags) {
    __ movl(flags, rbcp);
    __ restore_bcp();
  }
}
// Dispatch half of invokevirtual. Expected register state (asserted below):
//   index - rbx: f2 of the resolved cp-cache entry (vtable index, or the
//           Method* itself when the is_vfinal bit is set in flags)
//   recv  - rcx: receiver oop
//   flags - rdx: cp-cache entry flags (tested for is_vfinal)
// Clobbers rax and rdx as temporaries.
void TemplateTable::invokevirtual_helper(Register index,
                                         Register recv,
                                         Register flags) {
  // NOTE(review): this print executes once, when the interpreter template is
  // generated -- not each time an invokevirtual bytecode runs.
  tty->print("TemplateTable::invokevirtual_helper ");
  // Uses temporary registers rax, rdx
  assert_different_registers(index, recv, rax, rdx);
  assert(index == rbx, "");
  assert(recv == rcx, "");

  // Test for an invoke of a final method
  Label notFinal;
  __ movl(rax, flags);
  __ andl(rax, (1 << ConstantPoolCacheEntry::is_vfinal_shift));
  __ jcc(Assembler::zero, notFinal);

  const Register method = index; // method must be rbx
  assert(method == rbx,
         "Method* must be rbx for interpreter calling convention");

  // do the call - the index is actually the method to call
  // that is, f2 is a vtable index if !is_vfinal, else f2 is a Method*

  // It's final, need a null check here!
  __ null_check(recv);

  // profile this call
  __ profile_final_call(rax);
  __ profile_arguments_type(rax, method, rbcp, true);

  // Also load the receiver's Klass* in the final branch (author's local
  // modification; rax is free here -- it is only a temp for the jump below).
  Register tmp_load_klass = LP64_ONLY(rscratch1) NOT_LP64(noreg);
  __ load_klass(rax, recv, tmp_load_klass);
  __ jump_from_interpreted(method, rax);

  __ bind(notFinal);

  // get receiver klass (the null_check folds into the klass-field load)
  __ null_check(recv, oopDesc::klass_offset_in_bytes());
  __ load_klass(rax, recv, tmp_load_klass);

  // profile this call
  __ profile_virtual_call(rax, rlocals, rdx);
  // get target Method* & entry point
  __ lookup_virtual_method(rax, index, method);
  __ profile_arguments_type(rdx, method, rbcp, true);
  // ============== debug output begin ==============
  // Emit the debug print via the macro assembler.
  {
    // key registers already saved before this call
    // NOTE(review): print_debug_info_asm is a local helper defined elsewhere;
    // presumably it emits code that prints the Method* -- confirm its contract.
    print_debug_info_asm(method); // rbx contains the Method*
  }
  __ jump_from_interpreted(method, rdx);
}
// Jump to from_interpreted entry of a call unless single stepping is possible
// in this thread in which case we must call the i2i entry
// Jump to the from_interpreted entry of the callee unless single stepping is
// possible in this thread, in which case we must jump to the i2i
// (interpreter-to-interpreter) entry so the callee also runs interpreted.
//   method - register holding the callee Method*
//   temp   - scratch register (on LP64 it is simply rebound to r15_thread)
void InterpreterMacroAssembler::jump_from_interpreted(Register method, Register temp) {
  prepare_to_jump_from_interpreted();

  if (JvmtiExport::can_post_interpreter_events()) {
    Label run_compiled_code;
    // JVMTI events, such as single-stepping, are implemented partly by avoiding running
    // compiled code in threads for which the event is enabled. Check here for
    // interp_only_mode if these events CAN be enabled.
    // interp_only is an int, on little endian it is sufficient to test the byte only
    // Is a cmpl faster?
    LP64_ONLY(temp = r15_thread;)
    NOT_LP64(get_thread(temp);)
    cmpb(Address(temp, JavaThread::interp_only_mode_offset()), 0);
    jccb(Assembler::zero, run_compiled_code);
    // interp-only mode: take the interpreter entry instead
    jmp(Address(method, Method::interpreter_entry_offset()));
    bind(run_compiled_code);
  }
  jmp(Address(method, Method::from_interpreted_offset()));
}
//
// Generic interpreted method entry to (asm) interpreter
//
// Generate the asm-interpreter entry point for a normal (non-native) method.
// On entry: rbx = Method*, rbcp = sender sp. Lays out and zero-initializes
// the extra locals, builds the fixed interpreter frame, handles
// synchronization / invocation counters / JVMTI notification, then dispatches
// to the method's first bytecode. Returns the pc of the generated entry.
address TemplateInterpreterGenerator::generate_normal_entry(bool synchronized) {
  // determine code generation flags
  bool inc_counter = UseCompiler || CountCompiledCalls || LogTouchedMethods;

  // ebx: Method*
  // rbcp: sender sp
  address entry_point = __ pc();

  const Address constMethod(rbx, Method::const_offset());
  const Address access_flags(rbx, Method::access_flags_offset());
  const Address size_of_parameters(rdx,
                                   ConstMethod::size_of_parameters_offset());
  const Address size_of_locals(rdx, ConstMethod::size_of_locals_offset());

  // get parameter size (always needed)
  __ movptr(rdx, constMethod);
  __ load_unsigned_short(rcx, size_of_parameters);

  // rbx: Method*
  // rcx: size of parameters
  // rbcp: sender_sp (could differ from sp+wordSize if we were called via c2i )
  __ load_unsigned_short(rdx, size_of_locals); // get size of locals in words
  __ subl(rdx, rcx); // rdx = no. of additional locals

  // YYY
  //   __ incrementl(rdx);
  //   __ andl(rdx, -2);

  // see if we've got enough room on the stack for locals plus overhead.
  generate_stack_overflow_check();

  // yym-gaizao (author's local debug modification)
  // #ifdef DEBUG_PRINT_METHOD_NAME
  // ---yym--- the debug-print code was moved to after the stack check
  {
    // Save register state around the C call below.
    __ push(rax);
    __ push(rcx);
    __ push(rdx);
    __ push(rdi);
    __ push(rsi);
    __ push(r8);
    __ push(r9);
    __ push(r10);
    __ push(r11);
    // NOTE(review): r15_thread is an LP64 register; this NOT_LP64 get_thread
    // looks inconsistent for 32-bit builds -- confirm it compiles there.
    NOT_LP64(__ get_thread(r15_thread));
    __ push(r15); // 10 registers saved in total = 80 bytes

    // Compute the original RSP: current RSP + saved-register area + red zone.
    const int saved_regs_size = 10 * wordSize; // 10 registers * 8 bytes
    const int red_zone_size = 128;             // red zone size
    __ lea(rsi, Address(rsp, saved_regs_size + red_zone_size));

    // Set up C arguments.
    // NOTE(review): SysV integer-arg order is rdi,rsi,rdx,rcx,r8,r9; rdx/rcx
    // are not reloaded here, so verify print_debug_info's signature really
    // takes params_size/locals_size in r8/r9 (i.e. as the 5th/6th arguments).
    __ movptr(rdi, rbx); // Method*
    __ mov(r8, rcx);     // params_size
    __ mov(r9, rdx);     // locals_size

    // Align the stack pointer (16-byte alignment).
    __ subptr(rsp, 32);

    // Call the C++ debug-print routine.
    __ call(RuntimeAddress(
        CAST_FROM_FN_PTR(address,
                         TemplateInterpreterGenerator::print_debug_info)
    ));

    __ addptr(rsp, 32); // restore stack pointer

    // Restore register state.
    __ pop(r15);
    NOT_LP64(__ restore_thread(r15_thread));
    __ pop(r11);
    __ pop(r10);
    __ pop(r9);
    __ pop(r8);
    __ pop(rsi);
    __ pop(rdi);
    __ pop(rdx);
    __ pop(rcx);
    __ pop(rax);
  }
  // #endif

  // get return address
  __ pop(rax);

  // compute beginning of parameters (parameter count rcx scaled from rsp)
  __ lea(rlocals, Address(rsp, rcx, Interpreter::stackElementScale(), -wordSize));

  // rdx - # of additional locals
  // allocate space for locals
  // explicitly initialize locals
  {
    Label exit, loop;
    __ testl(rdx, rdx);
    __ jcc(Assembler::lessEqual, exit); // do nothing if rdx <= 0
    __ bind(loop);
    __ push((int) NULL_WORD); // initialize local variables
    __ decrementl(rdx);       // until everything initialized
    __ jcc(Assembler::greater, loop);
    __ bind(exit);
  }

  // initialize fixed part of activation frame
  generate_fixed_frame(false);

  // make sure method is not native & not abstract
#ifdef ASSERT
  __ movl(rax, access_flags);
  {
    Label L;
    __ testl(rax, JVM_ACC_NATIVE);
    __ jcc(Assembler::zero, L);
    __ stop("tried to execute native method as non-native");
    __ bind(L);
  }
  {
    Label L;
    __ testl(rax, JVM_ACC_ABSTRACT);
    __ jcc(Assembler::zero, L);
    __ stop("tried to execute abstract method in interpreter");
    __ bind(L);
  }
#endif

  // Since at this point in the method invocation the exception
  // handler would try to exit the monitor of synchronized methods
  // which hasn't been entered yet, we set the thread local variable
  // _do_not_unlock_if_synchronized to true. The remove_activation
  // will check this flag.

  const Register thread = NOT_LP64(rax) LP64_ONLY(r15_thread);
  NOT_LP64(__ get_thread(thread));
  const Address do_not_unlock_if_synchronized(thread,
      in_bytes(JavaThread::do_not_unlock_if_synchronized_offset()));
  __ movbool(do_not_unlock_if_synchronized, true);
  __ profile_parameters_type(rax, rcx, rdx);
  // increment invocation count & check for overflow
  Label invocation_counter_overflow;
  if (inc_counter) {
    generate_counter_incr(&invocation_counter_overflow);
  }

  Label continue_after_compile;
  __ bind(continue_after_compile);

  // check for synchronized interpreted methods
  bang_stack_shadow_pages(false);

  // reset the _do_not_unlock_if_synchronized flag
  NOT_LP64(__ get_thread(thread));
  __ movbool(do_not_unlock_if_synchronized, false);

  // check for synchronized methods
  // Must happen AFTER invocation_counter check and stack overflow check,
  // so method is not locked if overflows.
  if (synchronized) {
    // Allocate monitor and lock method
    lock_method();
  } else {
    // no synchronization necessary
#ifdef ASSERT
    {
      Label L;
      __ movl(rax, access_flags);
      __ testl(rax, JVM_ACC_SYNCHRONIZED);
      __ jcc(Assembler::zero, L);
      __ stop("method needs synchronization");
      __ bind(L);
    }
#endif
  }

  // start execution
#ifdef ASSERT
  {
    Label L;
    const Address monitor_block_top (rbp,
        frame::interpreter_frame_monitor_block_top_offset * wordSize);
    __ movptr(rax, monitor_block_top);
    __ cmpptr(rax, rsp);
    __ jcc(Assembler::equal, L);
    __ stop("broken stack frame setup in interpreter");
    __ bind(L);
  }
#endif

  // jvmti support
  __ notify_method_entry();

  __ dispatch_next(vtos);

  // invocation counter overflow
  if (inc_counter) {
    // Handle overflow of counter and compile method
    __ bind(invocation_counter_overflow);
    generate_counter_overflow(continue_after_compile);
  }

  return entry_point;
}
// Generate a fixed interpreter frame. This is identical setup for
// interpreted methods and for native methods hence the shared code.
//
// Args:
// rax: return address
// rbx: Method*
// r14/rdi: pointer to locals
// r13/rsi: sender sp
// rdx: cp cache
void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) {
  // initialize fixed part of activation frame; each push below fills one
  // fixed interpreter-frame slot
  __ push(rax);            // save return address
  __ enter();              // save old & set new rbp
  __ push(rbcp);           // set sender sp
  __ push((int)NULL_WORD); // leave last_sp as null
  __ movptr(rbcp, Address(rbx, Method::const_offset()));    // get ConstMethod*
  __ lea(rbcp, Address(rbcp, ConstMethod::codes_offset())); // get codebase
  __ push(rbx);            // save Method*
  // Get mirror and store it in the frame as GC root for this Method*
  __ load_mirror(rdx, rbx);
  __ push(rdx);
  if (ProfileInterpreter) {
    Label method_data_continue;
    __ movptr(rdx, Address(rbx, in_bytes(Method::method_data_offset())));
    __ testptr(rdx, rdx);
    // no MethodData* yet: push null as the mdp
    __ jcc(Assembler::zero, method_data_continue);
    __ addptr(rdx, in_bytes(MethodData::data_offset()));
    __ bind(method_data_continue);
    __ push(rdx);          // set the mdp (method data pointer)
  } else {
    __ push(0);
  }

  // Method* -> ConstMethod* -> ConstantPool* -> cache
  __ movptr(rdx, Address(rbx, Method::const_offset()));
  __ movptr(rdx, Address(rdx, ConstMethod::constants_offset()));
  __ movptr(rdx, Address(rdx, ConstantPool::cache_offset_in_bytes()));
  __ push(rdx);            // set constant pool cache
  __ push(rlocals);        // set locals pointer
  if (native_call) {
    __ push(0);            // no bcp
  } else {
    __ push(rbcp);         // set bcp
  }
  __ push(0);              // reserve word for pointer to expression stack bottom
  __ movptr(Address(rsp, 0), rsp); // set expression stack bottom
}
// Template for aload / aload_<n>: push the object reference stored in local
// variable slot n onto the operand stack (cached in rax as the TOS value).
void TemplateTable::aload(int n) {
  transition(vtos, atos);       // tos state: no cached value -> reference in rax
  __ movptr(rax, aaddress(n));  // load local slot n into the TOS register
}
public static void main(java.lang.String[]);
descriptor: ([Ljava/lang/String;)V
flags: (0x0009) ACC_PUBLIC, ACC_STATIC
Code:
stack=2, locals=2, args_size=1
0: new #5 // class ByteCodeTest
3: dup
4: invokespecial #6 // Method "<init>":()V
7: astore_1
8: aload_1
9: invokevirtual #7 // Method test:()I
12: pop
13: return
LineNumberTable:
line 15: 0
line 16: 8
line 17: 13
yym@yym:~/javaTest/javaByteCode$ cat ByteCodeTest.java
/**
 * Minimal example class used by the article to study the
 * astore_1 / aload_1 / invokevirtual bytecode sequence emitted for an
 * instance-method call. Do not reformat: the bytecode offsets quoted in the
 * article (7: astore_1, 8: aload_1, 9: invokevirtual #7) depend on this
 * exact source.
 */
public class ByteCodeTest {
    /** Adds two ints; called from test() via invokevirtual. */
    public int add(int a, int b) {
        return a+b;
    }
    /** Calls add(a, b) on this instance and returns the sum. */
    public int test() {
        int a = 2146598;
        int b = 1091754;
        int c = add(a, b);
        return c;
    }
    /** Creates an instance (new/dup/invokespecial + astore_1), then calls test() (aload_1 + invokevirtual). */
    public static void main(String[] args) {
        ByteCodeTest byteCodeTest = new ByteCodeTest();
        byteCodeTest.test();
    }
}