# Copyright (C) 2023-2025 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
# Callee save
macro saveIPIntRegisters()
# NOTE: We intentionally don't save or restore pinned wasm registers here. These are saved
# and restored when entering Wasm by the JSToWasm wrapper and changes to them are meant
# to be observable within the same Wasm module.
subp IPIntCalleeSaveSpaceStackAligned, sp
if ARM64 or ARM64E
storepairq MC, PC, -2 * SlotSize[cfr]
elsif X86_64 or RISCV64
storep PC, -1 * SlotSize[cfr]
storep MC, -2 * SlotSize[cfr]
end
end
macro restoreIPIntRegisters()
# NOTE: We intentionally don't restore pinned wasm registers here. These are saved
# and restored when entering Wasm by the JSToWasm wrapper and changes to them are meant
# to be observable within the same Wasm module.
if ARM64 or ARM64E
loadpairq -2 * SlotSize[cfr], MC, PC
elsif X86_64 or RISCV64
loadp -1 * SlotSize[cfr], PC
loadp -2 * SlotSize[cfr], MC
end
addp IPIntCalleeSaveSpaceStackAligned, sp
end
# Dispatch target bases
if ARM64 or ARM64E
const ipint_dispatch_base = _ipint_unreachable
const ipint_gc_dispatch_base = _ipint_struct_new
const ipint_conversion_dispatch_base = _ipint_i32_trunc_sat_f32_s
const ipint_simd_dispatch_base = _ipint_simd_v128_load_mem
const ipint_atomic_dispatch_base = _ipint_memory_atomic_notify
end
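# These label constants are only needed on ARM64/ARM64E, where pcrtoaddr can materialize
# handler addresses directly; x86_64 instead loads its dispatch bases from the OpcodeConfig
# storage at runtime.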
# Tail-call bytecode dispatch
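# Handlers are laid out at a fixed stride (alignIPInt bytes), so the dispatch target is
# dispatch_base + (opcode << log2(alignIPInt)).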
macro nextIPIntInstruction()
loadb [PC], t0
if ARM64 or ARM64E
# x0 = opcode
pcrtoaddr ipint_dispatch_base, t7
addlshiftp t7, t0, (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0
jmp t0
elsif X86_64
leap _os_script_config_storage, t1
loadp JSC::LLInt::OpcodeConfig::ipint_dispatch_base[t1], t1
lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0
addq t1, t0
jmp t0
else
error
end
end
# Stack operations
# Every value on the stack is always 16 bytes! This makes life easy.
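# pushQuad stores the register twice so every value occupies a full 16-byte slot;
# popQuad reads the low 8 bytes and releases the whole slot (V128ISize bytes).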
macro pushQuad(reg)
if ARM64 or ARM64E
push reg, reg
elsif X86_64
push reg, reg
else
break
end
end
macro pushQuadPair(reg1, reg2)
push reg1, reg2
end
macro popQuad(reg)
# FIXME: emit post-increment in offlineasm
if ARM64 or ARM64E
loadqinc [sp], reg, V128ISize
elsif X86_64
loadq [sp], reg
addq V128ISize, sp
else
break
end
end
macro pushVec(reg)
pushv reg
end
macro popVec(reg)
popv reg
end
# Typed push/pop to make code pretty
macro pushInt32(reg)
pushQuad(reg)
end
macro popInt32(reg)
popQuad(reg)
end
macro pushFloat32(reg)
pushv reg
end
macro popFloat32(reg)
popv reg
end
macro pushInt64(reg)
pushQuad(reg)
end
macro popInt64(reg)
popQuad(reg)
end
macro pushFloat64(reg)
pushv reg
end
macro popFloat64(reg)
popv reg
end
# Entering IPInt
# MC = location in argumINT bytecode
# csr0 = tmp
# sc0 = dst
# csr2 = src
# csr3 = end
# csr4 = for dispatch
const argumINTTmp = csr0
const argumINTDst = sc0
const argumINTSrc = csr2
const argumINTEnd = csr3
const argumINTDsp = csr4
macro ipintEntry()
const argumINTEndAsScratch = argumINTEnd
checkStackOverflow(ws0, argumINTEndAsScratch)
# Allocate space for locals and rethrow values
if ARM64 or ARM64E
loadpairi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], argumINTTmp, argumINTEnd
else
loadi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], argumINTTmp
loadi Wasm::IPIntCallee::m_numRethrowSlotsToAlloc[ws0], argumINTEnd
end
mulp LocalSize, argumINTEnd
mulp LocalSize, argumINTTmp
subp argumINTEnd, sp
move sp, argumINTEnd
subp argumINTTmp, sp
move sp, argumINTDsp
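# argumINTEnd now points just past the locals (at the base of the rethrow slots);
# argumINTDsp points at the first local.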
loadp Wasm::IPIntCallee::m_argumINTBytecode + VectorBufferOffset[ws0], MC
push argumINTTmp, argumINTDst, argumINTSrc, argumINTEnd
move argumINTDsp, argumINTDst
leap FirstArgumentOffset[cfr], argumINTSrc
validateOpcodeConfig(argumINTTmp)
argumINTDispatch()
end
macro argumINTDispatch()
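# Read one argumINT opcode, bounds-check it, and jump to its handler; handlers are spaced
# alignArgumInt bytes apart starting at _argumINT_begin.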
loadb [MC], argumINTTmp
addp 1, MC
bbgteq argumINTTmp, (constexpr IPInt::ArgumINTBytecode::NumOpcodes), _ipint_argument_dispatch_err
lshiftp (constexpr (WTF::fastLog2(JSC::IPInt::alignArgumInt))), argumINTTmp
if ARM64 or ARM64E
pcrtoaddr _argumINT_begin, argumINTDsp
addp argumINTTmp, argumINTDsp
jmp argumINTDsp
elsif X86_64
leap (_argumINT_begin - _ipint_entry_relativePCBase)[PL], argumINTDsp
addp argumINTTmp, argumINTDsp
jmp argumINTDsp
else
break
end
end
macro argumINTInitializeDefaultLocals()
# zero out remaining locals
bpeq argumINTDst, argumINTEnd, .ipint_entry_finish_zero
loadb [MC], argumINTTmp
addp 1, MC
sxb2p argumINTTmp, argumINTTmp
andp ValueNull, argumINTTmp
if ARM64 or ARM64E
# offlineasm doesn't have xzr so emit it
emit "stp x19, xzr, [x9]"
elsif X86_64
storep argumINTTmp, [argumINTDst]
storep 0, 8[argumINTDst]
end
addp LocalSize, argumINTDst
end
macro argumINTFinish()
pop argumINTEnd, argumINTSrc, argumINTDst, argumINTTmp
end
#############################
# 0x00 - 0x11: control flow #
#############################
ipintOp(_unreachable, macro()
# unreachable
# Push to stack for the handler
push PC, MC
push PL, ws0
move cfr, a1
move sp, a2
operationCall(macro() cCall3(_ipint_extern_unreachable_breakpoint_handler) end)
# Remove pushed values
addq 4 * SlotSize, sp
bqeq r0, 0, .exception
.continue:
nextIPIntInstruction()
.exception:
ipintException(Unreachable)
end)
ipintOp(_nop, macro()
# nop
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_block, macro()
# block
validateOpcodeConfig(t0)
if ARM64 or ARM64E
loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1
else
loadi IPInt::BlockMetadata::deltaPC[MC], t0
loadi IPInt::BlockMetadata::deltaMC[MC], t1
end
sxi2q t0, t0
sxi2q t1, t1
advancePCByReg(t0)
advanceMCByReg(t1)
nextIPIntInstruction()
end)
ipintOp(_loop, macro()
# loop
# We already validateOpcodeConfig in ipintLoopOSR.
ipintLoopOSR(1)
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMCByReg(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
end)
ipintOp(_if, macro()
# if
validateOpcodeConfig(t1)
popInt32(t0)
bineq 0, t0, .ipint_if_taken
if ARM64 or ARM64E
loadpairi IPInt::IfMetadata::elseDeltaPC[MC], t0, t1
else
loadi IPInt::IfMetadata::elseDeltaPC[MC], t0
loadi IPInt::IfMetadata::elseDeltaMC[MC], t1
end
advancePCByReg(t0)
advanceMCByReg(t1)
nextIPIntInstruction()
.ipint_if_taken:
# Skip LEB128
loadb IPInt::IfMetadata::instructionLength[MC], t0
advanceMC(constexpr (sizeof(IPInt::IfMetadata)))
advancePCByReg(t0)
nextIPIntInstruction()
end)
ipintOp(_else, macro()
# else
# Counterintuitively, we only run this instruction if the if
# clause is TAKEN. This is used to branch to the end of the
# block.
validateOpcodeConfig(t0)
if ARM64 or ARM64E
loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1
else
loadi IPInt::BlockMetadata::deltaPC[MC], t0
loadi IPInt::BlockMetadata::deltaMC[MC], t1
end
# always skipping forward - no need to sign-extend t0, t1
advancePCByReg(t0)
advanceMCByReg(t1)
nextIPIntInstruction()
end)
ipintOp(_try, macro()
validateOpcodeConfig(t0)
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
end)
ipintOp(_catch, macro()
# Counterintuitively, like else, we only run this instruction
# if no exception was thrown during the preceding try or catch block.
validateOpcodeConfig(t0)
if ARM64 or ARM64E
loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1
else
loadi IPInt::BlockMetadata::deltaPC[MC], t0
loadi IPInt::BlockMetadata::deltaMC[MC], t1
end
# always skipping forward - no need to sign-extend t0, t1
advancePCByReg(t0)
advanceMCByReg(t1)
nextIPIntInstruction()
end)
ipintOp(_throw, macro()
saveCallSiteIndex()
loadp JSWebAssemblyInstance::m_vm[wasmInstance], t0
loadp VM::topEntryFrame[t0], t0
copyCalleeSavesToEntryFrameCalleeSavesBuffer(t0)
move cfr, a1
move sp, a2
loadi IPInt::ThrowMetadata::exceptionIndex[MC], a3
operationCall(macro() cCall4(_ipint_extern_throw_exception) end)
jumpToException()
end)
ipintOp(_rethrow, macro()
saveCallSiteIndex()
loadp JSWebAssemblyInstance::m_vm[wasmInstance], t0
loadp VM::topEntryFrame[t0], t0
copyCalleeSavesToEntryFrameCalleeSavesBuffer(t0)
move cfr, a1
move PL, a2
loadi IPInt::RethrowMetadata::tryDepth[MC], a3
operationCall(macro() cCall4(_ipint_extern_rethrow_exception) end)
jumpToException()
end)
ipintOp(_throw_ref, macro()
popQuad(a2)
bieq a2, ValueNull, .throw_null_ref
saveCallSiteIndex()
loadp JSWebAssemblyInstance::m_vm[wasmInstance], t0
loadp VM::topEntryFrame[t0], t0
copyCalleeSavesToEntryFrameCalleeSavesBuffer(t0)
move cfr, a1
operationCall(macro() cCall3(_ipint_extern_throw_ref) end)
jumpToException()
.throw_null_ref:
throwException(NullExnrefReference)
end)
macro uintDispatch()
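# uINT bytecode is interpreted on the return path to move this frame's results into their
# return locations. Read one opcode, bounds-check it, and jump to its handler in the table
# starting at _uint_begin.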
if ARM64 or ARM64E
loadb [MC], sc2
addq 1, MC
bigteq sc2, (constexpr IPInt::UIntBytecode::NumOpcodes), _ipint_uint_dispatch_err
lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignUInt))), sc2
pcrtoaddr _uint_begin, sc3
addq sc2, ws3
jmp ws3
elsif X86_64
loadb [MC], sc1
addq 1, MC
bigteq sc1, (constexpr IPInt::UIntBytecode::NumOpcodes), _ipint_uint_dispatch_err
lshiftq 6, sc1
leap (_uint_begin - _mint_entry_relativePCBase)[PC, sc1], sc1
jmp sc1
end
end
ipintOp(_end, macro()
validateOpcodeConfig(t1)
if X86_64
loadp UnboxedWasmCalleeStackSlot[cfr], ws0
end
loadp Wasm::IPIntCallee::m_bytecodeEnd[ws0], t1
bqeq PC, t1, .ipint_end_ret
advancePC(1)
nextIPIntInstruction()
end)
# This implementation is intentionally defined outside of ipintOp scope to keep the implementation of end tight.
.ipint_end_ret:
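# Return sequence: fetch this callee's uINT bytecode, check for epilogue tier-up (OSR),
# compute the top of the return area relative to cfr, then run the uINT program to move
# the results into place.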
loadp Wasm::IPIntCallee::m_uINTBytecode + VectorBufferOffset[ws0], MC
ipintEpilogueOSR(10)
if X86_64
loadp UnboxedWasmCalleeStackSlot[cfr], ws0
end
loadi Wasm::IPIntCallee::m_topOfReturnStackFPOffset[ws0], sc0
addp cfr, sc0
initPCRelative(mint_entry, PC)
// We've already validateOpcodeConfig() in all the places that can jump to .ipint_end_ret.
uintDispatch()
ipintOp(_br, macro()
# br
validateOpcodeConfig(t0)
loadh IPInt::BranchTargetMetadata::toPop[MC], t0
# number to keep
loadh IPInt::BranchTargetMetadata::toKeep[MC], t1
# ex. pop 3 and keep 2
#
# +4 +3 +2 +1 sp
# a b c d e
# d e
#
# [sp + k + numToPop] = [sp + k] for k in numToKeep-1 -> 0
move t0, t2
mulq StackValueSize, t2
leap [sp, t2], t2
.ipint_br_poploop:
bqeq t1, 0, .ipint_br_popend
subq 1, t1
move t1, t3
mulq StackValueSize, t3
loadq [sp, t3], t0
storeq t0, [t2, t3]
loadq 8[sp, t3], t0
storeq t0, 8[t2, t3]
jmp .ipint_br_poploop
.ipint_br_popend:
loadh IPInt::BranchTargetMetadata::toPop[MC], t0
mulq StackValueSize, t0
leap [sp, t0], sp
if ARM64 or ARM64E
loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1
else
loadi IPInt::BlockMetadata::deltaPC[MC], t0
loadi IPInt::BlockMetadata::deltaMC[MC], t1
end
sxi2q t0, t0
sxi2q t1, t1
advancePCByReg(t0)
advanceMCByReg(t1)
nextIPIntInstruction()
end)
ipintOp(_br_if, macro()
# pop i32
validateOpcodeConfig(t2)
popInt32(t0)
bineq t0, 0, _ipint_br
loadb IPInt::BranchMetadata::instructionLength[MC], t0
advanceMC(constexpr (sizeof(IPInt::BranchMetadata)))
advancePCByReg(t0)
nextIPIntInstruction()
end)
ipintOp(_br_table, macro()
# br_table
validateOpcodeConfig(t2)
popInt32(t0)
loadi IPInt::SwitchMetadata::size[MC], t1
advanceMC(constexpr (sizeof(IPInt::SwitchMetadata)))
bib t0, t1, .ipint_br_table_clamped
subq t1, 1, t0
.ipint_br_table_clamped:
move t0, t1
muli (constexpr (sizeof(IPInt::BranchTargetMetadata))), t0
addq t0, MC
jmp _ipint_br
end)
ipintOp(_return, macro()
validateOpcodeConfig(MC)
# ret
if X86_64
loadp UnboxedWasmCalleeStackSlot[cfr], ws0
end
# This is guaranteed going to an end instruction, so skip
# dispatch and end of program check for speed
jmp .ipint_end_ret
end)
if ARM64 or ARM64E
const IPIntCallCallee = sc1
const IPIntCallFunctionSlot = sc0
elsif X86_64
const IPIntCallCallee = t7
const IPIntCallFunctionSlot = t6
end
ipintOp(_call, macro()
// The operationCall below already calls validateOpcodeConfig().
saveCallSiteIndex()
loadb IPInt::CallMetadata::length[MC], t0
advancePCByReg(t0)
move cfr, a1
move MC, a2
advanceMC(IPInt::CallMetadata::signature)
subq 16, sp
move sp, a3
# operation returns the entrypoint in r0 and the target instance in r1
# operation stores the target callee to sp[0] and target function info to sp[1]
operationCall(macro() cCall4(_ipint_extern_prepare_call) end)
loadq [sp], IPIntCallCallee
loadq 8[sp], IPIntCallFunctionSlot
addq 16, sp
# call
jmp .ipint_call_common
end)
ipintOp(_call_indirect, macro()
// The operationCall below already calls validateOpcodeConfig().
saveCallSiteIndex()
loadb IPInt::CallIndirectMetadata::length[MC], t2
advancePCByReg(t2)
# Get function index by pointer, use it as a return for callee
move sp, a2
# Get callIndirectMetadata
move cfr, a1
move MC, a3
advanceMC(IPInt::CallIndirectMetadata::signature)
operationCallMayThrow(macro() cCall4(_ipint_extern_prepare_call_indirect) end)
loadq [sp], IPIntCallCallee
loadq 8[sp], IPIntCallFunctionSlot
addq 16, sp
jmp .ipint_call_common
end)
ipintOp(_return_call, macro()
// The operationCall below already calls validateOpcodeConfig().
saveCallSiteIndex()
loadb IPInt::TailCallMetadata::length[MC], t0
advancePCByReg(t0)
move cfr, a1
move MC, a2
subq 16, sp
move sp, a3
# operation returns the entrypoint in r0 and the target instance in r1
# this operation stores the boxed Callee into *r2
operationCall(macro() cCall4(_ipint_extern_prepare_call) end)
loadq [sp], IPIntCallCallee
loadq 8[sp], IPIntCallFunctionSlot
addq 16, sp
loadi IPInt::TailCallMetadata::callerStackArgSize[MC], t3
advanceMC(IPInt::TailCallMetadata::argumentBytecode)
jmp .ipint_tail_call_common
end)
ipintOp(_return_call_indirect, macro()
// The operationCallMayThrow below already calls validateOpcodeConfig().
saveCallSiteIndex()
loadb IPInt::TailCallIndirectMetadata::length[MC], t2
advancePCByReg(t2)
# Get function index by pointer, use it as a return for callee
move sp, a2
# Get callIndirectMetadata
move cfr, a1
move MC, a3
operationCallMayThrow(macro() cCall4(_ipint_extern_prepare_call_indirect) end)
loadq [sp], IPIntCallCallee
loadq 8[sp], IPIntCallFunctionSlot
addq 16, sp
loadi IPInt::TailCallIndirectMetadata::callerStackArgSize[MC], t3
advanceMC(IPInt::TailCallIndirectMetadata::argumentBytecode)
jmp .ipint_tail_call_common
end)
ipintOp(_call_ref, macro()
// The operationCall below already calls validateOpcodeConfig().
saveCallSiteIndex()
move cfr, a1
move MC, a2
move sp, a3
operationCallMayThrow(macro() cCall4(_ipint_extern_prepare_call_ref) end)
loadq [sp], IPIntCallCallee
loadq 8[sp], IPIntCallFunctionSlot
addq 16, sp
loadb IPInt::CallRefMetadata::length[MC], t3
advanceMC(IPInt::CallRefMetadata::signature)
advancePCByReg(t3)
jmp .ipint_call_common
end)
ipintOp(_return_call_ref, macro()
// The operationCallMayThrow below already calls validateOpcodeConfig().
saveCallSiteIndex()
loadb IPInt::TailCallRefMetadata::length[MC], t2
advancePCByReg(t2)
move cfr, a1
move MC, a2
move sp, a3
operationCallMayThrow(macro() cCall4(_ipint_extern_prepare_call_ref) end)
loadq [sp], IPIntCallCallee
loadq 8[sp], IPIntCallFunctionSlot
addq 16, sp
loadi IPInt::TailCallRefMetadata::callerStackArgSize[MC], t3
advanceMC(IPInt::TailCallRefMetadata::argumentBytecode)
jmp .ipint_tail_call_common
end)
reservedOpcode(0x16)
reservedOpcode(0x17)
ipintOp(_delegate, macro()
# Counterintuitively, like else, we only run this instruction
# if no exception was thrown during the preceding try or catch block.
validateOpcodeConfig(t0)
if ARM64 or ARM64E
loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1
else
loadi IPInt::BlockMetadata::deltaPC[MC], t0
loadi IPInt::BlockMetadata::deltaMC[MC], t1
end
# always skipping forward - no need to sign-extend t0, t1
advancePCByReg(t0)
advanceMCByReg(t1)
nextIPIntInstruction()
end)
ipintOp(_catch_all, macro()
# Counterintuitively, like else, we only run this instruction
# if no exception was thrown during the preceding try or catch block.
validateOpcodeConfig(t0)
if ARM64 or ARM64E
loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1
else
loadi IPInt::BlockMetadata::deltaPC[MC], t0
loadi IPInt::BlockMetadata::deltaMC[MC], t1
end
# always skipping forward - no need to sign-extend t0, t1
advancePCByReg(t0)
advanceMCByReg(t1)
nextIPIntInstruction()
end)
ipintOp(_drop, macro()
addq StackValueSize, sp
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_select, macro()
popInt32(t0)
bieq t0, 0, .ipint_select_val2
addq StackValueSize, sp
advancePC(1)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_select_val2:
popVec(v1)
popVec(v0)
pushVec(v1)
advancePC(1)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
end)
ipintOp(_select_t, macro()
popInt32(t0)
bieq t0, 0, .ipint_select_t_val2
addq StackValueSize, sp
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_select_t_val2:
popVec(v1)
popVec(v0)
pushVec(v1)
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
end)
reservedOpcode(0x1d)
reservedOpcode(0x1e)
ipintOp(_try_table, macro()
# advance MC/PC
validateOpcodeConfig(t0)
if ARM64 or ARM64E
loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1
else
loadi IPInt::BlockMetadata::deltaPC[MC], t0
loadi IPInt::BlockMetadata::deltaMC[MC], t1
end
# always skipping forward - no need to sign-extend t0, t1
advancePCByReg(t0)
advanceMCByReg(t1)
nextIPIntInstruction()
end)
###################################
# 0x20 - 0x26: get and set values #
###################################
macro localGetPostDecode()
# Index into locals
mulq LocalSize, t0
loadv [PL, t0], v0
# Push to stack
pushVec(v0)
nextIPIntInstruction()
end
ipintOp(_local_get, macro()
# local.get
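# Fast path: indices below 128 fit in a single LEB byte (top bit clear); anything larger
# takes the slow path, which decodes the full LEB128.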
loadb 1[PC], t0
advancePC(2)
bbaeq t0, 128, _ipint_local_get_slow_path
localGetPostDecode()
end)
macro localSetPostDecode()
# Pop from stack
popVec(v0)
# Store to locals
mulq LocalSize, t0
storev v0, [PL, t0]
nextIPIntInstruction()
end
ipintOp(_local_set, macro()
# local.set
loadb 1[PC], t0
advancePC(2)
bbaeq t0, 128, _ipint_local_set_slow_path
localSetPostDecode()
end)
macro localTeePostDecode()
# Load from stack
loadv [sp], v0
# Store to locals
mulq LocalSize, t0
storev v0, [PL, t0]
nextIPIntInstruction()
end
ipintOp(_local_tee, macro()
# local.tee
loadb 1[PC], t0
advancePC(2)
bbaeq t0, 128, _ipint_local_tee_slow_path
localTeePostDecode()
end)
ipintOp(_global_get, macro()
loadb IPInt::GlobalMetadata::instructionLength[MC], t0
advancePCByReg(t0)
# Load pre-computed index from metadata
loadb IPInt::GlobalMetadata::bindingMode[MC], t2
loadi IPInt::GlobalMetadata::index[MC], t1
loadp JSWebAssemblyInstance::m_globals[wasmInstance], t0
advanceMC(constexpr (sizeof(IPInt::GlobalMetadata)))
lshiftp 1, t1
bieq t2, 0, .ipint_global_get_embedded
loadp [t0, t1, 8], t0
loadv [t0], v0
pushVec(v0)
nextIPIntInstruction()
.ipint_global_get_embedded:
loadv [t0, t1, 8], v0
pushVec(v0)
nextIPIntInstruction()
end)
ipintOp(_global_set, macro()
# isRef = 1 => ref, use slowpath
loadb IPInt::GlobalMetadata::isRef[MC], t0
bineq t0, 0, .ipint_global_set_refpath
# bindingMode = 1 => portable
loadb IPInt::GlobalMetadata::bindingMode[MC], t2
# get global addr
loadp JSWebAssemblyInstance::m_globals[wasmInstance], t0
# get value to store
popVec(v0)
# get index
loadi IPInt::GlobalMetadata::index[MC], t1
lshiftp 1, t1
bieq t2, 0, .ipint_global_set_embedded
# portable: dereference then set
loadp [t0, t1, 8], t0
storev v0, [t0]
loadb IPInt::GlobalMetadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::GlobalMetadata)))
jmp .ipint_global_set_dispatch
.ipint_global_set_embedded:
# embedded: set directly
storev v0, [t0, t1, 8]
loadb IPInt::GlobalMetadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::GlobalMetadata)))
jmp .ipint_global_set_dispatch
.ipint_global_set_refpath:
loadi IPInt::GlobalMetadata::index[MC], a1
# Pop from stack
popQuad(a2)
operationCall(macro() cCall3(_ipint_extern_set_global_ref) end)
loadb IPInt::GlobalMetadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::GlobalMetadata)))
.ipint_global_set_dispatch:
nextIPIntInstruction()
end)
ipintOp(_table_get, macro()
# Load pre-computed index from metadata
loadi IPInt::Const32Metadata::value[MC], a1
popInt32(a2)
operationCallMayThrow(macro() cCall3(_ipint_extern_table_get) end)
pushQuad(r0)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_table_set, macro()
# Load pre-computed index from metadata
loadi IPInt::Const32Metadata::value[MC], a1
popQuad(a3)
popInt32(a2)
operationCallMayThrow(macro() cCall4(_ipint_extern_table_set) end)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
reservedOpcode(0x27)
macro popMemoryIndex(reg, tmp)
popInt32(reg)
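# ori 0 zero-extends the 32-bit index, clearing any stale upper bits before it is used for addressing.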
ori 0, reg
end
macro ipintCheckMemoryBound(mem, scratch, size)
# Memory indices are 32 bit
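# Trap unless the last byte of the access (index + size - 1) is below boundsCheckingSize.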
leap size - 1[mem], scratch
bpb scratch, boundsCheckingSize, .continuation
ipintException(OutOfBoundsMemoryAccess)
.continuation:
end
macro loadMemoryOffsetAndAdvanceMC(dstReg, tmpReg, instrLenReg)
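# Load the constant memarg offset from metadata (64-bit for memory64, 32-bit otherwise),
# advance MC past the metadata, and return the instruction length so the caller can advance PC.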
loadb JSWebAssemblyInstance::m_cachedIsMemory64[wasmInstance], tmpReg
btiz tmpReg, .memory32
loadq IPInt::Const64Metadata::value[MC], dstReg
loadb IPInt::Const64Metadata::instructionLength[MC], instrLenReg
advanceMC(constexpr (sizeof(IPInt::Const64Metadata)))
jmp .done
.memory32:
loadi IPInt::Const32Metadata::value[MC], dstReg
loadb IPInt::Const32Metadata::instructionLength[MC], instrLenReg
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
.done:
end
ipintOp(_i32_load_mem, macro()
# i32.load
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 4)
# load memory location
loadi [memoryBase, t0], t1
pushInt32(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_load_mem, macro()
# i64.load
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 8)
# load memory location
loadq [memoryBase, t0], t1
pushInt64(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_f32_load_mem, macro()
# f32.load
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 4)
# load memory location
loadf [memoryBase, t0], ft0
pushFloat32(ft0)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_f64_load_mem, macro()
# f64.load
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 8)
# load memory location
loadd [memoryBase, t0], ft0
pushFloat64(ft0)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i32_load8s_mem, macro()
# i32.load8_s
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 1)
# load memory location
loadb [memoryBase, t0], t1
sxb2i t1, t1
pushInt32(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i32_load8u_mem, macro()
# i32.load8_u
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 1)
# load memory location
loadb [memoryBase, t0], t1
pushInt32(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i32_load16s_mem, macro()
# i32.load16_s
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 2)
# load memory location
loadh [memoryBase, t0], t1
sxh2i t1, t1
pushInt32(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i32_load16u_mem, macro()
# i32.load16_u
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 2)
# load memory location
loadh [memoryBase, t0], t1
pushInt32(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_load8s_mem, macro()
# i64.load8_s
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 1)
# load memory location
loadb [memoryBase, t0], t1
sxb2q t1, t1
pushInt64(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_load8u_mem, macro()
# i64.load8_u
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 1)
# load memory location
loadb [memoryBase, t0], t1
pushInt64(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_load16s_mem, macro()
# i64.load16_s
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 2)
# load memory location
loadh [memoryBase, t0], t1
sxh2q t1, t1
pushInt64(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_load16u_mem, macro()
# i64.load16_u
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 2)
# load memory location
loadh [memoryBase, t0], t1
pushInt64(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_load32s_mem, macro()
# i64.load32_s
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 4)
# load memory location
loadi [memoryBase, t0], t1
sxi2q t1, t1
pushInt64(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_load32u_mem, macro()
# i64.load32_u
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 4)
# load memory location
loadi [memoryBase, t0], t1
pushInt64(t1)
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i32_store_mem, macro()
# i32.store
# pop data
popInt32(t1)
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 4)
# load memory location
storei t1, [memoryBase, t0]
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_store_mem, macro()
# i64.store
# pop data
popInt64(t1)
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 8)
# load memory location
storeq t1, [memoryBase, t0]
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_f32_store_mem, macro()
# f32.store
# pop data
popFloat32(ft0)
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 4)
# load memory location
storef ft0, [memoryBase, t0]
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_f64_store_mem, macro()
# f64.store
# pop data
popFloat64(ft0)
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 8)
# load memory location
stored ft0, [memoryBase, t0]
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i32_store8_mem, macro()
# i32.store8
# pop data
popInt32(t1)
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 1)
# load memory location
storeb t1, [memoryBase, t0]
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i32_store16_mem, macro()
# i32.store16
# pop data
popInt32(t1)
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 2)
# load memory location
storeh t1, [memoryBase, t0]
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_store8_mem, macro()
# i64.store8
# pop data
popInt64(t1)
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 1)
# load memory location
storeb t1, [memoryBase, t0]
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_store16_mem, macro()
# i64.store16
# pop data
popInt64(t1)
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 2)
# load memory location
storeh t1, [memoryBase, t0]
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_i64_store32_mem, macro()
# i64.store32
# pop data
popInt64(t1)
# pop index
popMemoryIndex(t0, t2)
loadMemoryOffsetAndAdvanceMC(t2, t3, t4)
addp t2, t0
ipintCheckMemoryBound(t0, t2, 4)
# load memory location
storei t1, [memoryBase, t0]
advancePCByReg(t4)
nextIPIntInstruction()
end)
ipintOp(_memory_size, macro()
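# The cached memory size is in bytes; shifting right by 16 converts it to 64KiB wasm pages.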
loadp JSWebAssemblyInstance::m_cachedMemorySize[wasmInstance], t0
urshiftp 16, t0
zxi2q t0, t0
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_memory_grow, macro()
popInt32(a1)
operationCall(macro() cCall2(_ipint_extern_memory_grow) end)
pushInt32(r0)
ipintReloadMemory()
advancePC(2)
nextIPIntInstruction()
end)
################################
# 0x41 - 0x44: constant values #
################################
ipintOp(_i32_const, macro()
# i32.const
loadb IPInt::InstructionLengthMetadata::length[MC], t1
bigteq t1, 2, .ipint_i32_const_slowpath
loadb 1[PC], t0
lshiftq 7, t1
orq t1, t0
sxb2i t0, t0
pushInt32(t0)
advancePC(2)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_i32_const_slowpath:
# Load pre-computed value from metadata
loadi IPInt::Const32Metadata::value[MC], t0
# Push to stack
pushInt32(t0)
advancePCByReg(t1)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_i64_const, macro()
# i64.const
# Load pre-computed value from metadata
loadq IPInt::Const64Metadata::value[MC], t0
# Push to stack
pushInt64(t0)
loadb IPInt::Const64Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const64Metadata)))
nextIPIntInstruction()
end)
ipintOp(_f32_const, macro()
# f32.const
# Load the immediate directly from the instruction stream
loadf 1[PC], ft0
pushFloat32(ft0)
advancePC(5)
nextIPIntInstruction()
end)
ipintOp(_f64_const, macro()
# f64.const
# Load the immediate directly from the instruction stream
loadd 1[PC], ft0
pushFloat64(ft0)
advancePC(9)
nextIPIntInstruction()
end)
###############################
# 0x45 - 0x4f: i32 comparison #
###############################
ipintOp(_i32_eqz, macro()
# i32.eqz
popInt32(t0)
cieq t0, 0, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_eq, macro()
# i32.eq
popInt32(t1)
popInt32(t0)
cieq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_ne, macro()
# i32.ne
popInt32(t1)
popInt32(t0)
cineq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_lt_s, macro()
# i32.lt_s
popInt32(t1)
popInt32(t0)
cilt t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_lt_u, macro()
# i32.lt_u
popInt32(t1)
popInt32(t0)
cib t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_gt_s, macro()
# i32.gt_s
popInt32(t1)
popInt32(t0)
cigt t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_gt_u, macro()
# i32.gt_u
popInt32(t1)
popInt32(t0)
cia t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_le_s, macro()
# i32.le_s
popInt32(t1)
popInt32(t0)
cilteq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_le_u, macro()
# i32.le_u
popInt32(t1)
popInt32(t0)
cibeq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_ge_s, macro()
# i32.ge_s
popInt32(t1)
popInt32(t0)
cigteq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_ge_u, macro()
# i32.ge_u
popInt32(t1)
popInt32(t0)
ciaeq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
###############################
# 0x50 - 0x5a: i64 comparison #
###############################
ipintOp(_i64_eqz, macro()
# i64.eqz
popInt64(t0)
cqeq t0, 0, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_eq, macro()
# i64.eq
popInt64(t1)
popInt64(t0)
cqeq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_ne, macro()
# i64.ne
popInt64(t1)
popInt64(t0)
cqneq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_lt_s, macro()
# i64.lt_s
popInt64(t1)
popInt64(t0)
cqlt t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_lt_u, macro()
# i64.lt_u
popInt64(t1)
popInt64(t0)
cqb t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_gt_s, macro()
# i64.gt_s
popInt64(t1)
popInt64(t0)
cqgt t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_gt_u, macro()
# i64.gt_u
popInt64(t1)
popInt64(t0)
cqa t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_le_s, macro()
# i64.le_s
popInt64(t1)
popInt64(t0)
cqlteq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_le_u, macro()
# i64.le_u
popInt64(t1)
popInt64(t0)
cqbeq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_ge_s, macro()
# i64.ge_s
popInt64(t1)
popInt64(t0)
cqgteq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_ge_u, macro()
# i64.ge_u
popInt64(t1)
popInt64(t0)
cqaeq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
###############################
# 0x5b - 0x60: f32 comparison #
###############################
ipintOp(_f32_eq, macro()
# f32.eq
popFloat32(ft1)
popFloat32(ft0)
cfeq ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_ne, macro()
# f32.ne
popFloat32(ft1)
popFloat32(ft0)
cfnequn ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_lt, macro()
# f32.lt
popFloat32(ft1)
popFloat32(ft0)
cflt ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_gt, macro()
# f32.gt
popFloat32(ft1)
popFloat32(ft0)
cfgt ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_le, macro()
# f32.le
popFloat32(ft1)
popFloat32(ft0)
cflteq ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_ge, macro()
# f32.ge
popFloat32(ft1)
popFloat32(ft0)
cfgteq ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
###############################
# 0x61 - 0x66: f64 comparison #
###############################
ipintOp(_f64_eq, macro()
# f64.eq
popFloat64(ft1)
popFloat64(ft0)
cdeq ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_ne, macro()
# f64.ne
popFloat64(ft1)
popFloat64(ft0)
cdnequn ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_lt, macro()
# f64.lt
popFloat64(ft1)
popFloat64(ft0)
cdlt ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_gt, macro()
# f64.gt
popFloat64(ft1)
popFloat64(ft0)
cdgt ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_le, macro()
# f64.le
popFloat64(ft1)
popFloat64(ft0)
cdlteq ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_ge, macro()
# f64.ge
popFloat64(ft1)
popFloat64(ft0)
cdgteq ft0, ft1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
###############################
# 0x67 - 0x78: i32 operations #
###############################
ipintOp(_i32_clz, macro()
# i32.clz
popInt32(t0)
lzcnti t0, t1
pushInt32(t1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_ctz, macro()
# i32.ctz
popInt32(t0)
tzcnti t0, t1
pushInt32(t1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_popcnt, macro()
# i32.popcnt
popInt32(t1)
operationCall(macro() cCall2(_slow_path_wasm_popcount) end)
pushInt32(r1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_add, macro()
# i32.add
popInt32(t1)
popInt32(t0)
addi t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_sub, macro()
# i32.sub
popInt32(t1)
popInt32(t0)
subi t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_mul, macro()
# i32.mul
popInt32(t1)
popInt32(t0)
muli t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_div_s, macro()
# i32.div_s
popInt32(t1)
popInt32(t0)
btiz t1, .ipint_i32_div_s_throwDivisionByZero
bineq t1, -1, .ipint_i32_div_s_safe
bieq t0, constexpr INT32_MIN, .ipint_i32_div_s_throwIntegerOverflow
.ipint_i32_div_s_safe:
if X86_64
# FIXME: Add a way to static_assert that t0 is rax and t2 is rdx
# https://bugs.webkit.org/show_bug.cgi?id=203692
cdqi
idivi t1
elsif ARM64 or ARM64E or RISCV64
divis t1, t0
else
error
end
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_i32_div_s_throwDivisionByZero:
ipintException(DivisionByZero)
.ipint_i32_div_s_throwIntegerOverflow:
ipintException(IntegerOverflow)
end)
ipintOp(_i32_div_u, macro()
# i32.div_u
popInt32(t1)
popInt32(t0)
btiz t1, .ipint_i32_div_u_throwDivisionByZero
if X86_64
xori t2, t2
udivi t1
elsif ARM64 or ARM64E or RISCV64
divi t1, t0
else
error
end
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_i32_div_u_throwDivisionByZero:
ipintException(DivisionByZero)
end)
ipintOp(_i32_rem_s, macro()
# i32.rem_s
popInt32(t1)
popInt32(t0)
btiz t1, .ipint_i32_rem_s_throwDivisionByZero
bineq t1, -1, .ipint_i32_rem_s_safe
bineq t0, constexpr INT32_MIN, .ipint_i32_rem_s_safe
move 0, t2
jmp .ipint_i32_rem_s_return
.ipint_i32_rem_s_safe:
if X86_64
# FIXME: Add a way to static_assert that t0 is rax and t2 is rdx
# https://bugs.webkit.org/show_bug.cgi?id=203692
cdqi
idivi t1
elsif ARM64 or ARM64E
divis t1, t0, t2
muli t1, t2
subi t0, t2, t2
elsif RISCV64
remis t0, t1, t2
else
error
end
.ipint_i32_rem_s_return:
pushInt32(t2)
advancePC(1)
nextIPIntInstruction()
.ipint_i32_rem_s_throwDivisionByZero:
ipintException(DivisionByZero)
end)
ipintOp(_i32_rem_u, macro()
# i32.rem_u
popInt32(t1)
popInt32(t0)
btiz t1, .ipint_i32_rem_u_throwDivisionByZero
if X86_64
xori t2, t2
udivi t1
elsif ARM64 or ARM64E
divi t1, t0, t2
muli t1, t2
subi t0, t2, t2
elsif RISCV64
remi t0, t1, t2
else
error
end
pushInt32(t2)
advancePC(1)
nextIPIntInstruction()
.ipint_i32_rem_u_throwDivisionByZero:
ipintException(DivisionByZero)
end)
ipintOp(_i32_and, macro()
# i32.and
popInt32(t1)
popInt32(t0)
andi t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_or, macro()
# i32.or
popInt32(t1)
popInt32(t0)
ori t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_xor, macro()
# i32.xor
popInt32(t1)
popInt32(t0)
xori t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_shl, macro()
# i32.shl
popInt32(t1)
popInt32(t0)
lshifti t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_shr_s, macro()
# i32.shr_s
popInt32(t1)
popInt32(t0)
rshifti t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_shr_u, macro()
# i32.shr_u
popInt32(t1)
popInt32(t0)
urshifti t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_rotl, macro()
# i32.rotl
popInt32(t1)
popInt32(t0)
lrotatei t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_rotr, macro()
# i32.rotr
popInt32(t1)
popInt32(t0)
rrotatei t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
###############################
# 0x79 - 0x8a: i64 operations #
###############################
ipintOp(_i64_clz, macro()
# i64.clz
popInt64(t0)
lzcntq t0, t1
pushInt64(t1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_ctz, macro()
# i64.ctz
popInt64(t0)
tzcntq t0, t1
pushInt64(t1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_popcnt, macro()
# i64.popcnt
popInt64(t1)
operationCall(macro() cCall2(_slow_path_wasm_popcountll) end)
pushInt64(r1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_add, macro()
# i64.add
popInt64(t1)
popInt64(t0)
addq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_sub, macro()
# i64.sub
popInt64(t1)
popInt64(t0)
subq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_mul, macro()
# i64.mul
popInt64(t1)
popInt64(t0)
mulq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_div_s, macro()
# i64.div_s
popInt64(t1)
popInt64(t0)
btqz t1, .ipint_i64_div_s_throwDivisionByZero
bqneq t1, -1, .ipint_i64_div_s_safe
bqeq t0, constexpr INT64_MIN, .ipint_i64_div_s_throwIntegerOverflow
.ipint_i64_div_s_safe:
if X86_64
# FIXME: Add a way to static_assert that t0 is rax and t2 is rdx
# https://bugs.webkit.org/show_bug.cgi?id=203692
cqoq
idivq t1
elsif ARM64 or ARM64E or RISCV64
divqs t1, t0
else
error
end
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_i64_div_s_throwDivisionByZero:
ipintException(DivisionByZero)
.ipint_i64_div_s_throwIntegerOverflow:
ipintException(IntegerOverflow)
end)
ipintOp(_i64_div_u, macro()
# i64.div_u
popInt64(t1)
popInt64(t0)
btqz t1, .ipint_i64_div_u_throwDivisionByZero
if X86_64
xorq t2, t2
udivq t1
elsif ARM64 or ARM64E or RISCV64
divq t1, t0
else
error
end
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_i64_div_u_throwDivisionByZero:
ipintException(DivisionByZero)
end)
ipintOp(_i64_rem_s, macro()
# i64.rem_s
popInt64(t1)
popInt64(t0)
btqz t1, .ipint_i64_rem_s_throwDivisionByZero
bqneq t1, -1, .ipint_i64_rem_s_safe
bqneq t0, constexpr INT64_MIN, .ipint_i64_rem_s_safe
move 0, t2
jmp .ipint_i64_rem_s_return
.ipint_i64_rem_s_safe:
if X86_64
# FIXME: Add a way to static_assert that t0 is rax and t2 is rdx
# https://bugs.webkit.org/show_bug.cgi?id=203692
cqoq
idivq t1
elsif ARM64 or ARM64E
divqs t1, t0, t2
mulq t1, t2
subq t0, t2, t2
elsif RISCV64
remqs t0, t1, t2
else
error
end
.ipint_i64_rem_s_return:
pushInt64(t2)
advancePC(1)
nextIPIntInstruction()
.ipint_i64_rem_s_throwDivisionByZero:
ipintException(DivisionByZero)
end)
ipintOp(_i64_rem_u, macro()
# i64.rem_u
popInt64(t1)
popInt64(t0)
btqz t1, .ipint_i64_rem_u_throwDivisionByZero
if X86_64
xorq t2, t2
udivq t1
elsif ARM64 or ARM64E
divq t1, t0, t2
mulq t1, t2
subq t0, t2, t2
elsif RISCV64
remq t0, t1, t2
else
error
end
pushInt64(t2)
advancePC(1)
nextIPIntInstruction()
.ipint_i64_rem_u_throwDivisionByZero:
ipintException(DivisionByZero)
end)
ipintOp(_i64_and, macro()
# i64.and
popInt64(t1)
popInt64(t0)
andq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_or, macro()
# i64.or
popInt64(t1)
popInt64(t0)
orq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_xor, macro()
# i64.xor
popInt64(t1)
popInt64(t0)
xorq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_shl, macro()
# i64.shl
popInt64(t1)
popInt64(t0)
lshiftq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_shr_s, macro()
# i64.shr_s
popInt64(t1)
popInt64(t0)
rshiftq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_shr_u, macro()
# i64.shr_u
popInt64(t1)
popInt64(t0)
urshiftq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_rotl, macro()
# i64.rotl
popInt64(t1)
popInt64(t0)
lrotateq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_rotr, macro()
# i64.rotr
popInt64(t1)
popInt64(t0)
rrotateq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
###############################
# 0x8b - 0x98: f32 operations #
###############################
ipintOp(_f32_abs, macro()
# f32.abs
popFloat32(ft0)
absf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_neg, macro()
# f32.neg
popFloat32(ft0)
negf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_ceil, macro()
# f32.ceil
popFloat32(ft0)
ceilf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_floor, macro()
# f32.floor
popFloat32(ft0)
floorf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_trunc, macro()
# f32.trunc
popFloat32(ft0)
truncatef ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_nearest, macro()
# f32.nearest
popFloat32(ft0)
roundf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_sqrt, macro()
# f32.sqrt
popFloat32(ft0)
sqrtf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_add, macro()
# f32.add
popFloat32(ft1)
popFloat32(ft0)
addf ft1, ft0
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_sub, macro()
# f32.sub
popFloat32(ft1)
popFloat32(ft0)
subf ft1, ft0
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_mul, macro()
# f32.mul
popFloat32(ft1)
popFloat32(ft0)
mulf ft1, ft0
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_div, macro()
# f32.div
popFloat32(ft1)
popFloat32(ft0)
divf ft1, ft0
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_min, macro()
# f32.min
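# NaN and signed-zero handling: if either operand is NaN, the unordered fallthrough adds the
# operands to produce a NaN; if they compare equal (the +0/-0 case), OR-ing the bit patterns
# yields -0, the correct minimum. f32.max below uses AND instead to yield +0.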
popFloat32(ft1)
popFloat32(ft0)
bfeq ft0, ft1, .ipint_f32_min_equal
bflt ft0, ft1, .ipint_f32_min_lt
bfgt ft0, ft1, .ipint_f32_min_return
.ipint_f32_min_NaN:
addf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f32_min_equal:
orf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f32_min_lt:
moved ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f32_min_return:
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_max, macro()
# f32.max
popFloat32(ft1)
popFloat32(ft0)
bfeq ft1, ft0, .ipint_f32_max_equal
bflt ft1, ft0, .ipint_f32_max_lt
bfgt ft1, ft0, .ipint_f32_max_return
.ipint_f32_max_NaN:
addf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f32_max_equal:
andf ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f32_max_lt:
moved ft0, ft1
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f32_max_return:
pushFloat32(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_copysign, macro()
# f32.copysign
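# Combine the sign bit of the second operand with the magnitude of the first by masking their
# integer bit patterns and OR-ing them back together.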
popFloat32(ft1)
popFloat32(ft0)
ff2i ft1, t1
move 0x80000000, t2
andi t2, t1
ff2i ft0, t0
move 0x7fffffff, t2
andi t2, t0
ori t1, t0
fi2f t0, ft0
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
###############################
# 0x99 - 0xa6: f64 operations #
###############################
ipintOp(_f64_abs, macro()
# f64.abs
popFloat64(ft0)
absd ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_neg, macro()
# f64.neg
popFloat64(ft0)
negd ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_ceil, macro()
# f64.ceil
popFloat64(ft0)
ceild ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_floor, macro()
# f64.floor
popFloat64(ft0)
floord ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_trunc, macro()
# f64.trunc
popFloat64(ft0)
truncated ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_nearest, macro()
# f64.nearest
popFloat64(ft0)
roundd ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_sqrt, macro()
# f64.sqrt
popFloat64(ft0)
sqrtd ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_add, macro()
# f64.add
popFloat64(ft1)
popFloat64(ft0)
addd ft1, ft0
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_sub, macro()
# f64.sub
popFloat64(ft1)
popFloat64(ft0)
subd ft1, ft0
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_mul, macro()
# f64.mul
popFloat64(ft1)
popFloat64(ft0)
muld ft1, ft0
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_div, macro()
# f64.div
popFloat64(ft1)
popFloat64(ft0)
divd ft1, ft0
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_min, macro()
# f64.min
popFloat64(ft1)
popFloat64(ft0)
bdeq ft0, ft1, .ipint_f64_min_equal
bdlt ft0, ft1, .ipint_f64_min_lt
bdgt ft0, ft1, .ipint_f64_min_return
.ipint_f64_min_NaN:
addd ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f64_min_equal:
ord ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f64_min_lt:
moved ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f64_min_return:
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_max, macro()
# f64.max
popFloat64(ft1)
popFloat64(ft0)
bdeq ft1, ft0, .ipint_f64_max_equal
bdlt ft1, ft0, .ipint_f64_max_lt
bdgt ft1, ft0, .ipint_f64_max_return
.ipint_f64_max_NaN:
addd ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f64_max_equal:
andd ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f64_max_lt:
moved ft0, ft1
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
.ipint_f64_max_return:
pushFloat64(ft1)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_copysign, macro()
# f64.copysign
popFloat64(ft1)
popFloat64(ft0)
fd2q ft1, t1
move 0x8000000000000000, t2
andq t2, t1
fd2q ft0, t0
move 0x7fffffffffffffff, t2
andq t2, t0
orq t1, t0
fq2d t0, ft0
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
############################
# 0xa7 - 0xc4: conversions #
############################
ipintOp(_i32_wrap_i64, macro()
# because of how we store values on stack, do nothing
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_trunc_f32_s, macro()
popFloat32(ft0)
move 0xcf000000, t0 # INT32_MIN (Note that INT32_MIN - 1.0 in float is the same as INT32_MIN in float).
fi2f t0, ft1
bfltun ft0, ft1, .ipint_trunc_i32_f32_s_outOfBoundsTrunc
move 0x4f000000, t0 # -INT32_MIN
fi2f t0, ft1
bfgtequn ft0, ft1, .ipint_trunc_i32_f32_s_outOfBoundsTrunc
truncatef2is ft0, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_trunc_i32_f32_s_outOfBoundsTrunc:
ipintException(OutOfBoundsTrunc)
end)
ipintOp(_i32_trunc_f32_u, macro()
popFloat32(ft0)
move 0xbf800000, t0 # -1.0
fi2f t0, ft1
bfltequn ft0, ft1, .ipint_trunc_i32_f32_u_outOfBoundsTrunc
move 0x4f800000, t0 # INT32_MIN * -2.0
fi2f t0, ft1
bfgtequn ft0, ft1, .ipint_trunc_i32_f32_u_outOfBoundsTrunc
truncatef2i ft0, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_trunc_i32_f32_u_outOfBoundsTrunc:
ipintException(OutOfBoundsTrunc)
end)
ipintOp(_i32_trunc_f64_s, macro()
popFloat64(ft0)
move 0xc1e0000000200000, t0 # INT32_MIN - 1.0
fq2d t0, ft1
bdltequn ft0, ft1, .ipint_trunc_i32_f64_s_outOfBoundsTrunc
move 0x41e0000000000000, t0 # -INT32_MIN
fq2d t0, ft1
bdgtequn ft0, ft1, .ipint_trunc_i32_f64_s_outOfBoundsTrunc
truncated2is ft0, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_trunc_i32_f64_s_outOfBoundsTrunc:
ipintException(OutOfBoundsTrunc)
end)
ipintOp(_i32_trunc_f64_u, macro()
popFloat64(ft0)
move 0xbff0000000000000, t0 # -1.0
fq2d t0, ft1
bdltequn ft0, ft1, .ipint_trunc_i32_f64_u_outOfBoundsTrunc
move 0x41f0000000000000, t0 # INT32_MIN * -2.0
fq2d t0, ft1
bdgtequn ft0, ft1, .ipint_trunc_i32_f64_u_outOfBoundsTrunc
truncated2i ft0, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_trunc_i32_f64_u_outOfBoundsTrunc:
ipintException(OutOfBoundsTrunc)
end)
ipintOp(_i64_extend_i32_s, macro()
popInt32(t0)
sxi2q t0, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_extend_i32_u, macro()
popInt32(t0)
move 0, t1
noti t1
andq t1, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_trunc_f32_s, macro()
popFloat32(ft0)
move 0xdf000000, t0 # INT64_MIN
fi2f t0, ft1
bfltun ft0, ft1, .ipint_trunc_i64_f32_s_outOfBoundsTrunc
move 0x5f000000, t0 # -INT64_MIN
fi2f t0, ft1
bfgtequn ft0, ft1, .ipint_trunc_i64_f32_s_outOfBoundsTrunc
truncatef2qs ft0, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_trunc_i64_f32_s_outOfBoundsTrunc:
ipintException(OutOfBoundsTrunc)
end)
ipintOp(_i64_trunc_f32_u, macro()
popFloat32(ft0)
move 0xbf800000, t0 # -1.0
fi2f t0, ft1
bfltequn ft0, ft1, .ipint_i64_f32_u_outOfBoundsTrunc
move 0x5f800000, t0 # INT64_MIN * -2.0
fi2f t0, ft1
bfgtequn ft0, ft1, .ipint_i64_f32_u_outOfBoundsTrunc
truncatef2q ft0, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_i64_f32_u_outOfBoundsTrunc:
ipintException(OutOfBoundsTrunc)
end)
ipintOp(_i64_trunc_f64_s, macro()
popFloat64(ft0)
move 0xc3e0000000000000, t0 # INT64_MIN
fq2d t0, ft1
bdltun ft0, ft1, .ipint_i64_f64_s_outOfBoundsTrunc
move 0x43e0000000000000, t0 # -INT64_MIN
fq2d t0, ft1
bdgtequn ft0, ft1, .ipint_i64_f64_s_outOfBoundsTrunc
truncated2qs ft0, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_i64_f64_s_outOfBoundsTrunc:
ipintException(OutOfBoundsTrunc)
end)
ipintOp(_i64_trunc_f64_u, macro()
popFloat64(ft0)
move 0xbff0000000000000, t0 # -1.0
fq2d t0, ft1
bdltequn ft0, ft1, .ipint_i64_f64_u_outOfBoundsTrunc
move 0x43f0000000000000, t0 # INT64_MIN * -2.0
fq2d t0, ft1
bdgtequn ft0, ft1, .ipint_i64_f64_u_outOfBoundsTrunc
truncated2q ft0, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
.ipint_i64_f64_u_outOfBoundsTrunc:
ipintException(OutOfBoundsTrunc)
end)
ipintOp(_f32_convert_i32_s, macro()
popInt32(t0)
andq 0xffffffff, t0
ci2fs t0, ft0
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_convert_i32_u, macro()
popInt32(t0)
andq 0xffffffff, t0
ci2f t0, ft0
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_convert_i64_s, macro()
popInt64(t0)
cq2fs t0, ft0
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_convert_i64_u, macro()
popInt64(t0)
if X86_64
cq2f t0, t1, ft0
else
cq2f t0, ft0
end
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_demote_f64, macro()
popFloat64(ft0)
cd2f ft0, ft0
pushFloat32(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_convert_i32_s, macro()
popInt32(t0)
andq 0xffffffff, t0
ci2ds t0, ft0
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_convert_i32_u, macro()
popInt32(t0)
andq 0xffffffff, t0
ci2d t0, ft0
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_convert_i64_s, macro()
popInt64(t0)
cq2ds t0, ft0
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_convert_i64_u, macro()
popInt64(t0)
if X86_64
cq2d t0, t1, ft0
else
cq2d t0, ft0
end
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_promote_f32, macro()
popFloat32(ft0)
cf2d ft0, ft0
pushFloat64(ft0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_reinterpret_f32, macro()
popFloat32(ft0)
ff2i ft0, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_reinterpret_f64, macro()
popFloat64(ft0)
fd2q ft0, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f32_reinterpret_i32, macro()
# nop because of stack layout
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_f64_reinterpret_i64, macro()
# nop because of stack layout
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_extend8_s, macro()
# i32.extend8_s
popInt32(t0)
sxb2i t0, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i32_extend16_s, macro()
# i32.extend16_s
popInt32(t0)
sxh2i t0, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_extend8_s, macro()
# i64.extend8_s
popInt64(t0)
sxb2q t0, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_extend16_s, macro()
# i64.extend16_s
popInt64(t0)
sxh2q t0, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_i64_extend32_s, macro()
# i64.extend32_s
popInt64(t0)
sxi2q t0, t0
pushInt64(t0)
advancePC(1)
nextIPIntInstruction()
end)
reservedOpcode(0xc5)
reservedOpcode(0xc6)
reservedOpcode(0xc7)
reservedOpcode(0xc8)
reservedOpcode(0xc9)
reservedOpcode(0xca)
reservedOpcode(0xcb)
reservedOpcode(0xcc)
reservedOpcode(0xcd)
reservedOpcode(0xce)
reservedOpcode(0xcf)
#####################
# 0xd0 - 0xd6: refs #
#####################
ipintOp(_ref_null_t, macro()
loadi IPInt::Const32Metadata::value[MC], t0
pushQuad(t0)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePC(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_ref_is_null, macro()
popQuad(t0)
cqeq t0, ValueNull, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_ref_func, macro()
loadi IPInt::Const32Metadata::value[MC], a1
operationCall(macro() cCall2(_ipint_extern_ref_func) end)
pushQuad(r0)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePC(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_ref_eq, macro()
popQuad(t0)
popQuad(t1)
cqeq t0, t1, t0
pushInt32(t0)
advancePC(1)
nextIPIntInstruction()
end)
ipintOp(_ref_as_non_null, macro()
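    # ref.as_non_null - trap if the reference on top of the stack is null; the value stays on the stack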
loadq [sp], t0
bqeq t0, ValueNull, .ref_as_non_null_nullRef
advancePC(1)
nextIPIntInstruction()
.ref_as_non_null_nullRef:
throwException(NullRefAsNonNull)
end)
ipintOp(_br_on_null, macro()
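    # br_on_null - if the reference is null, pop it and take the branch; otherwise leave it on the stack and fall through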
validateOpcodeConfig(t0)
loadq [sp], t0
bqneq t0, ValueNull, .br_on_null_not_null
# pop the null
addq StackValueSize, sp
jmp _ipint_br
.br_on_null_not_null:
loadb IPInt::BranchMetadata::instructionLength[MC], t0
advanceMC(constexpr (sizeof(IPInt::BranchMetadata)))
advancePCByReg(t0)
nextIPIntInstruction()
end)
ipintOp(_br_on_non_null, macro()
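    # br_on_non_null - take the branch (keeping the reference) if it is non-null; otherwise pop it and fall through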
validateOpcodeConfig(t0)
loadq [sp], t0
bqneq t0, ValueNull, _ipint_br
addq StackValueSize, sp
loadb IPInt::BranchMetadata::instructionLength[MC], t0
advanceMC(constexpr (sizeof(IPInt::BranchMetadata)))
advancePCByReg(t0)
nextIPIntInstruction()
end)
reservedOpcode(0xd7)
reservedOpcode(0xd8)
reservedOpcode(0xd9)
reservedOpcode(0xda)
reservedOpcode(0xdb)
reservedOpcode(0xdc)
reservedOpcode(0xdd)
reservedOpcode(0xde)
reservedOpcode(0xdf)
reservedOpcode(0xe0)
reservedOpcode(0xe1)
reservedOpcode(0xe2)
reservedOpcode(0xe3)
reservedOpcode(0xe4)
reservedOpcode(0xe5)
reservedOpcode(0xe6)
reservedOpcode(0xe7)
reservedOpcode(0xe8)
reservedOpcode(0xe9)
reservedOpcode(0xea)
reservedOpcode(0xeb)
reservedOpcode(0xec)
reservedOpcode(0xed)
reservedOpcode(0xee)
reservedOpcode(0xef)
reservedOpcode(0xf0)
reservedOpcode(0xf1)
reservedOpcode(0xf2)
reservedOpcode(0xf3)
reservedOpcode(0xf4)
reservedOpcode(0xf5)
reservedOpcode(0xf6)
reservedOpcode(0xf7)
reservedOpcode(0xf8)
reservedOpcode(0xf9)
reservedOpcode(0xfa)
# If the following four instructions are ever given more descriptive names,
# update IPINT_INSTRUCTIONS in Tools/lldb/debug_ipint.py to match.
ipintOp(_gc_prefix, macro()
decodeLEBVarUInt32(1, t0, t1, t2, t3, t4)
    # Security guarantee: always less than 31 (0x00 -> 0x1e)
biaeq t0, 0x1f, .ipint_gc_nonexistent
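    # Handlers for prefixed opcodes are laid out 256 bytes apart, so the target address
    # is the dispatch base plus (sub-opcode << 8).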
leap _os_script_config_storage, t1
loadp JSC::LLInt::OpcodeConfig::ipint_gc_dispatch_base[t1], t1
if ARM64 or ARM64E
addlshiftp t1, t0, 8, t0
jmp t0
elsif X86_64
lshiftq 8, t0
addq t1, t0
jmp t0
end
.ipint_gc_nonexistent:
break
end)
ipintOp(_conversion_prefix, macro()
decodeLEBVarUInt32(1, t0, t1, t2, t3, t4)
# Security guarantee: always less than 18 (0x00 -> 0x11)
biaeq t0, 0x12, .ipint_conversion_nonexistent
leap _os_script_config_storage, t1
loadp JSC::LLInt::OpcodeConfig::ipint_conversion_dispatch_base[t1], t1
if ARM64 or ARM64E
addlshiftp t1, t0, 8, t0
jmp t0
elsif X86_64
lshiftq 8, t0
addq t1, t0
jmp t0
end
.ipint_conversion_nonexistent:
break
end)
ipintOp(_simd_prefix, macro()
decodeLEBVarUInt32(1, t0, t1, t2, t3, t4)
# Security guarantee: always less than 256 (0x00 -> 0xff)
biaeq t0, 0x100, .ipint_simd_nonexistent
leap _os_script_config_storage, t1
loadp JSC::LLInt::OpcodeConfig::ipint_simd_dispatch_base[t1], t1
if ARM64 or ARM64E
addlshiftp t1, t0, 8, t0
jmp t0
elsif X86_64
lshiftq 8, t0
addq t1, t0
jmp t0
end
.ipint_simd_nonexistent:
break
end)
ipintOp(_atomic_prefix, macro()
decodeLEBVarUInt32(1, t0, t1, t2, t3, t4)
    # Security guarantee: always less than 79 (0x00 -> 0x4e)
biaeq t0, 0x4f, .ipint_atomic_nonexistent
leap _os_script_config_storage, t1
loadp JSC::LLInt::OpcodeConfig::ipint_atomic_dispatch_base[t1], t1
if ARM64 or ARM64E
addlshiftp t1, t0, 8, t0
jmp t0
elsif X86_64
lshiftq 8, t0
addq t1, t0
jmp t0
end
.ipint_atomic_nonexistent:
break
end)
reservedOpcode(0xff)
break
#####################
## GC instructions ##
#####################
ipintOp(_struct_new, macro()
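    # struct.new - the field values are on the stack; the runtime consumes them, then we pop them and push the new struct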
loadi IPInt::StructNewMetadata::type[MC], a1 # type
move sp, a2
operationCallMayThrow(macro() cCall3(_ipint_extern_struct_new) end)
loadh IPInt::StructNewMetadata::params[MC], t1 # number of parameters popped
mulq StackValueSize, t1
addq t1, sp
pushQuad(r0)
loadb IPInt::StructNewMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::StructNewMetadata)))
nextIPIntInstruction()
end)
ipintOp(_struct_new_default, macro()
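    # struct.new_default - no stack arguments; the runtime builds a struct with default field values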
loadi IPInt::StructNewDefaultMetadata::type[MC], a1 # type
operationCallMayThrow(macro() cCall2(_ipint_extern_struct_new_default) end)
pushQuad(r0)
loadb IPInt::StructNewDefaultMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::StructNewDefaultMetadata)))
nextIPIntInstruction()
end)
ipintOp(_struct_get, macro()
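    # struct.get - pop the object, then hand the runtime a freshly allocated stack slot to fill with the field value
    # (struct.get_s and struct.get_u below follow the same shape)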
popQuad(a1) # object
loadi IPInt::StructGetSetMetadata::fieldIndex[MC], a2 # field index
subp StackValueSize, sp # allocate space for result
move sp, a3 # result location
operationCallMayThrow(macro() cCall4(_ipint_extern_struct_get) end)
loadb IPInt::StructGetSetMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::StructGetSetMetadata)))
nextIPIntInstruction()
end)
ipintOp(_struct_get_s, macro()
popQuad(a1) # object
loadi IPInt::StructGetSetMetadata::fieldIndex[MC], a2 # field index
subp StackValueSize, sp # allocate space for result
move sp, a3 # result location
operationCallMayThrow(macro() cCall4(_ipint_extern_struct_get_s) end)
loadb IPInt::StructGetSetMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::StructGetSetMetadata)))
nextIPIntInstruction()
end)
ipintOp(_struct_get_u, macro()
popQuad(a1) # object
loadi IPInt::StructGetSetMetadata::fieldIndex[MC], a2 # field index
subp StackValueSize, sp # allocate space for result
move sp, a3 # result location
operationCallMayThrow(macro() cCall4(_ipint_extern_struct_get) end)
loadb IPInt::StructGetSetMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::StructGetSetMetadata)))
nextIPIntInstruction()
end)
ipintOp(_struct_set, macro()
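    # struct.set - the object sits one slot below the value on the stack; after the runtime call, pop both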
loadp StackValueSize[sp], a1 # object
loadi IPInt::StructGetSetMetadata::fieldIndex[MC], a2 # field index
move sp, a3
operationCallMayThrow(macro() cCall4(_ipint_extern_struct_set) end)
loadb IPInt::StructGetSetMetadata::length[MC], t0
addp 2 * StackValueSize, sp
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::StructGetSetMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_new, macro()
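    # array.new - pop the length, pass a pointer to the default value still on the stack, then pop it and push the new array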
loadi IPInt::ArrayNewMetadata::type[MC], a1 # type
popInt32(a2) # length
move sp, a3 # pointer to default value
operationCallMayThrow(macro() cCall4(_ipint_extern_array_new) end)
addp StackValueSize, sp # pop default value
pushQuad(r0)
loadb IPInt::ArrayNewMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayNewMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_new_default, macro()
loadi IPInt::ArrayNewMetadata::type[MC], a1 # type
popInt32(a2) # length
operationCallMayThrow(macro() cCall3(_ipint_extern_array_new_default) end)
pushQuad(r0)
loadb IPInt::ArrayNewMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayNewMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_new_fixed, macro()
loadi IPInt::ArrayNewFixedMetadata::type[MC], a1 # type
loadi IPInt::ArrayNewFixedMetadata::arraySize[MC], a2 # array length
move sp, a3 # arguments
operationCallMayThrow(macro() cCall4(_ipint_extern_array_new_fixed) end)
# pop all the arguments
loadi IPInt::ArrayNewFixedMetadata::arraySize[MC], t3 # array length
muli StackValueSize, t3
addp t3, sp
pushQuad(r0)
loadb IPInt::ArrayNewFixedMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayNewFixedMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_new_data, macro()
move MC, a1 # metadata
popInt32(a3) # size
popInt32(a2) # offset
operationCallMayThrow(macro() cCall4(_ipint_extern_array_new_data) end)
pushQuad(r0)
loadb IPInt::ArrayNewDataMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayNewDataMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_new_elem, macro()
move MC, a1 # metadata
popInt32(a3) # size
popInt32(a2) # offset
operationCallMayThrow(macro() cCall4(_ipint_extern_array_new_elem) end)
pushQuad(r0)
loadb IPInt::ArrayNewElemMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayNewElemMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_get, macro()
loadi IPInt::ArrayGetSetMetadata::type[MC], a1 # type
move sp, a2 # all args on stack, result will be returned on stack
operationCallMayThrow(macro() cCall3(_ipint_extern_array_get) end)
addp StackValueSize, sp # 2 args - 1 result
loadb IPInt::ArrayGetSetMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayGetSetMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_get_s, macro()
loadi IPInt::ArrayGetSetMetadata::type[MC], a1 # type
move sp, a2 # all args on stack, result will be returned on stack
operationCallMayThrow(macro() cCall3(_ipint_extern_array_get_s) end)
addp StackValueSize, sp # 2 args - 1 result
loadb IPInt::ArrayGetSetMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayGetSetMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_get_u, macro()
loadi IPInt::ArrayGetSetMetadata::type[MC], a1 # type
move sp, a2 # all args on stack, result will be returned on stack
operationCallMayThrow(macro() cCall3(_ipint_extern_array_get) end)
addp StackValueSize, sp # 2 args - 1 result
loadb IPInt::ArrayGetSetMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayGetSetMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_set, macro()
loadi IPInt::ArrayGetSetMetadata::type[MC], a1 # type
move sp, a2 # stack pointer with all the arguments
operationCallMayThrow(macro() cCall3(_ipint_extern_array_set) end)
addq StackValueSize * 3, sp
loadb IPInt::ArrayGetSetMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayGetSetMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_len, macro()
popQuad(t0) # array into t0
bqeq t0, ValueNull, .nullArray
loadi JSWebAssemblyArray::m_size[t0], t0
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
.nullArray:
throwException(NullAccess)
end)
ipintOp(_array_fill, macro()
move sp, a1
operationCallMayThrow(macro() cCall2(_ipint_extern_array_fill) end)
addp StackValueSize * 4, sp
loadb IPInt::ArrayFillMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayFillMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_copy, macro()
move sp, a1
operationCallMayThrow(macro() cCall2(_ipint_extern_array_copy) end)
addp StackValueSize * 5, sp
loadb IPInt::ArrayFillMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayCopyMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_init_data, macro()
loadi IPInt::ArrayInitDataMetadata::dataSegmentIndex[MC], a1
move sp, a2
operationCallMayThrow(macro() cCall3(_ipint_extern_array_init_data) end)
addp StackValueSize * 4, sp
loadb IPInt::ArrayInitDataMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayInitDataMetadata)))
nextIPIntInstruction()
end)
ipintOp(_array_init_elem, macro()
loadi IPInt::ArrayInitElemMetadata::elemSegmentIndex[MC], a1
move sp, a2
operationCallMayThrow(macro() cCall3(_ipint_extern_array_init_elem) end)
addp StackValueSize * 4, sp
loadb IPInt::ArrayInitElemMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::ArrayInitElemMetadata)))
nextIPIntInstruction()
end)
ipintOp(_ref_test, macro()
loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1
move 0, a2 # allowNull
popQuad(a3)
operationCall(macro() cCall3(_ipint_extern_ref_test) end)
pushInt32(r0)
loadb IPInt::RefTestCastMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata)))
nextIPIntInstruction()
end)
ipintOp(_ref_test_nullable, macro()
loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1
move 1, a2 # allowNull
popQuad(a3)
operationCall(macro() cCall3(_ipint_extern_ref_test) end)
pushInt32(r0)
loadb IPInt::RefTestCastMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata)))
nextIPIntInstruction()
end)
ipintOp(_ref_cast, macro()
loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1
move 0, a2 # allowNull
popQuad(a3)
operationCallMayThrow(macro() cCall3(_ipint_extern_ref_cast) end)
pushInt32(r0)
loadb IPInt::RefTestCastMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata)))
nextIPIntInstruction()
end)
ipintOp(_ref_cast_nullable, macro()
loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1
move 1, a2 # allowNull
popQuad(a3)
operationCallMayThrow(macro() cCall3(_ipint_extern_ref_cast) end)
pushInt32(r0)
loadb IPInt::RefTestCastMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata)))
nextIPIntInstruction()
end)
ipintOp(_br_on_cast, macro()
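    # br_on_cast - branch if the reference on top of the stack casts to the target type.
    # The flags byte at PC + 2 encodes nullability; bit 1 (null2) says whether the target
    # type is nullable and becomes the allowNull argument of the type check.
    # The RefTestCastMetadata is immediately followed by the BranchMetadata for the branch.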
validateOpcodeConfig(a1)
loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1
# fb 18 FLAGS
loadb 2[PC], a2
rshifti 1, a2 # bit 1 = null2
loadq [sp], a3
operationCall(macro() cCall3(_ipint_extern_ref_test) end)
advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata)))
bineq r0, 0, _ipint_br
loadb IPInt::BranchMetadata::instructionLength[MC], t0
advanceMC(constexpr (sizeof(IPInt::BranchMetadata)))
advancePCByReg(t0)
nextIPIntInstruction()
end)
ipintOp(_br_on_cast_fail, macro()
validateOpcodeConfig(a1)
loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1
loadb 2[PC], a2
# fb 19 FLAGS
rshifti 1, a2 # bit 1 = null2
loadq [sp], a3
operationCall(macro() cCall3(_ipint_extern_ref_test) end)
advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata)))
bieq r0, 0, _ipint_br
loadb IPInt::BranchMetadata::instructionLength[MC], t0
advanceMC(constexpr (sizeof(IPInt::BranchMetadata)))
advancePCByReg(t0)
nextIPIntInstruction()
end)
ipintOp(_any_convert_extern, macro()
popQuad(a1)
operationCall(macro() cCall2(_ipint_extern_any_convert_extern) end)
pushQuad(r0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_extern_convert_any, macro()
# do nothing
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_ref_i31, macro()
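    # ref.i31 - keep the low 31 bits (sign-extending from bit 30) and box the result as a number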
popInt32(t0)
lshifti 0x1, t0
rshifti 0x1, t0
orq TagNumber, t0
pushQuad(t0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_i31_get_s, macro()
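    # i31.get_s - null check; the low 32 bits of the boxed value already hold the sign-extended i31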
popQuad(t0)
bqeq t0, ValueNull, .i31_get_throw
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
.i31_get_throw:
throwException(NullI31Get)
end)
ipintOp(_i31_get_u, macro()
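    # i31.get_u - null check, then mask to the low 31 bits (dropping the tag and sign bit) for the unsigned value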
popQuad(t0)
bqeq t0, ValueNull, .i31_get_throw
andq 0x7fffffff, t0
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
.i31_get_throw:
throwException(NullI31Get)
end)
#############################
## Conversion instructions ##
#############################
ipintOp(_i32_trunc_sat_f32_s, macro()
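    # i32.trunc_sat_f32_s - saturating truncation: NaN produces 0, values below INT32_MIN
    # saturate to INT32_MIN, and values at or above 2^31 saturate to INT32_MAX.
    # The remaining trunc_sat variants below follow the same check-then-clamp pattern
    # with their own bounds.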
popFloat32(ft0)
move 0xcf000000, t0 # INT32_MIN (Note that INT32_MIN - 1.0 in float is the same as INT32_MIN in float).
fi2f t0, ft1
bfltun ft0, ft1, .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN
move 0x4f000000, t0 # -INT32_MIN
fi2f t0, ft1
bfgtequn ft0, ft1, .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMax
truncatef2is ft0, t0
pushInt32(t0)
.end:
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN:
bfeq ft0, ft0, .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMin
move 0, t0
pushInt32(t0)
jmp .end
.ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMax:
move (constexpr INT32_MAX), t0
pushInt32(t0)
jmp .end
.ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMin:
move (constexpr INT32_MIN), t0
pushInt32(t0)
jmp .end
end)
ipintOp(_i32_trunc_sat_f32_u, macro()
popFloat32(ft0)
move 0xbf800000, t0 # -1.0
fi2f t0, ft1
bfltequn ft0, ft1, .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMin
move 0x4f800000, t0 # INT32_MIN * -2.0
fi2f t0, ft1
bfgtequn ft0, ft1, .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMax
truncatef2i ft0, t0
pushInt32(t0)
.end:
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMin:
move 0, t0
pushInt32(t0)
jmp .end
.ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMax:
move (constexpr UINT32_MAX), t0
pushInt32(t0)
jmp .end
end)
ipintOp(_i32_trunc_sat_f64_s, macro()
popFloat64(ft0)
move 0xc1e0000000200000, t0 # INT32_MIN - 1.0
fq2d t0, ft1
bdltequn ft0, ft1, .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN
move 0x41e0000000000000, t0 # -INT32_MIN
fq2d t0, ft1
bdgtequn ft0, ft1, .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMax
truncated2is ft0, t0
pushInt32(t0)
.end:
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN:
bdeq ft0, ft0, .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMin
move 0, t0
pushInt32(t0)
jmp .end
.ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMax:
move (constexpr INT32_MAX), t0
pushInt32(t0)
jmp .end
.ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMin:
move (constexpr INT32_MIN), t0
pushInt32(t0)
jmp .end
end)
ipintOp(_i32_trunc_sat_f64_u, macro()
popFloat64(ft0)
move 0xbff0000000000000, t0 # -1.0
fq2d t0, ft1
bdltequn ft0, ft1, .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMin
move 0x41f0000000000000, t0 # INT32_MIN * -2.0
fq2d t0, ft1
bdgtequn ft0, ft1, .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMax
truncated2i ft0, t0
pushInt32(t0)
.end:
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMin:
move 0, t0
pushInt32(t0)
jmp .end
.ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMax:
move (constexpr UINT32_MAX), t0
pushInt32(t0)
jmp .end
end)
ipintOp(_i64_trunc_sat_f32_s, macro()
popFloat32(ft0)
move 0xdf000000, t0 # INT64_MIN
fi2f t0, ft1
bfltun ft0, ft1, .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN
move 0x5f000000, t0 # -INT64_MIN
fi2f t0, ft1
bfgtequn ft0, ft1, .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMax
truncatef2qs ft0, t0
pushInt64(t0)
.end:
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN:
bfeq ft0, ft0, .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMin
move 0, t0
pushInt64(t0)
jmp .end
.ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMax:
move (constexpr INT64_MAX), t0
pushInt64(t0)
jmp .end
.ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMin:
move (constexpr INT64_MIN), t0
pushInt64(t0)
jmp .end
end)
ipintOp(_i64_trunc_sat_f32_u, macro()
popFloat32(ft0)
move 0xbf800000, t0 # -1.0
fi2f t0, ft1
bfltequn ft0, ft1, .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMin
move 0x5f800000, t0 # INT64_MIN * -2.0
fi2f t0, ft1
bfgtequn ft0, ft1, .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMax
truncatef2q ft0, t0
pushInt64(t0)
.end:
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMin:
move 0, t0
pushInt64(t0)
jmp .end
.ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMax:
move (constexpr UINT64_MAX), t0
pushInt64(t0)
jmp .end
end)
ipintOp(_i64_trunc_sat_f64_s, macro()
popFloat64(ft0)
move 0xc3e0000000000000, t0 # INT64_MIN
fq2d t0, ft1
bdltun ft0, ft1, .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN
move 0x43e0000000000000, t0 # -INT64_MIN
fq2d t0, ft1
bdgtequn ft0, ft1, .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMax
truncated2qs ft0, t0
pushInt64(t0)
.end:
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN:
bdeq ft0, ft0, .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMin
move 0, t0
pushInt64(t0)
jmp .end
.ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMax:
move (constexpr INT64_MAX), t0
pushInt64(t0)
jmp .end
.ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMin:
move (constexpr INT64_MIN), t0
pushInt64(t0)
jmp .end
end)
ipintOp(_i64_trunc_sat_f64_u, macro()
popFloat64(ft0)
move 0xbff0000000000000, t0 # -1.0
fq2d t0, ft1
bdltequn ft0, ft1, .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMin
move 0x43f0000000000000, t0 # INT64_MIN * -2.0
fq2d t0, ft1
bdgtequn ft0, ft1, .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMax
truncated2q ft0, t0
pushInt64(t0)
.end:
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
.ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMin:
move 0, t0
pushInt64(t0)
jmp .end
.ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMax:
move (constexpr UINT64_MAX), t0
pushInt64(t0)
jmp .end
end)
ipintOp(_memory_init, macro()
# memory.init
move sp, a2
loadi 1[MC], a1
operationCallMayThrow(macro() cCall3(_ipint_extern_memory_init) end)
addq 3 * StackValueSize, sp
loadb [MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata))) # xxx check
nextIPIntInstruction()
end)
ipintOp(_data_drop, macro()
# data.drop
loadi 1[MC], a1
operationCall(macro() cCall2(_ipint_extern_data_drop) end)
loadb [MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata))) # xxx check
nextIPIntInstruction()
end)
ipintOp(_memory_copy, macro()
# memory.copy
popQuad(a3) # n
popQuad(a2) # s
popQuad(a1) # d
operationCallMayThrow(macro() cCall4(_ipint_extern_memory_copy) end)
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
end)
ipintOp(_memory_fill, macro()
# memory.fill
popQuad(a3) # n
popQuad(a2) # val
popQuad(a1) # d
operationCallMayThrow(macro() cCall4(_ipint_extern_memory_fill) end)
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
end)
ipintOp(_table_init, macro()
# table.init
move sp, a1
leap [MC], a2 # IPInt::tableInitMetadata
operationCallMayThrow(macro() cCall3(_ipint_extern_table_init) end)
addp 3 * StackValueSize, sp
loadb IPInt::TableInitMetadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::TableInitMetadata)))
nextIPIntInstruction()
end)
ipintOp(_elem_drop, macro()
# elem.drop
loadi IPInt::Const32Metadata::value[MC], a1
operationCall(macro() cCall2(_ipint_extern_elem_drop) end)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_table_copy, macro()
# table.copy
move sp, a1
move MC, a2
operationCallMayThrow(macro() cCall3(_ipint_extern_table_copy) end)
addp 3 * StackValueSize, sp
loadb IPInt::TableCopyMetadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::TableCopyMetadata)))
nextIPIntInstruction()
end)
ipintOp(_table_grow, macro()
# table.grow
move sp, a1
move MC, a2 # IPInt::tableGrowMetadata
operationCall(macro() cCall3(_ipint_extern_table_grow) end)
addp StackValueSize * 2, sp
pushQuad(r0)
loadb IPInt::TableGrowMetadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::TableGrowMetadata)))
nextIPIntInstruction()
end)
ipintOp(_table_size, macro()
# table.size
loadi IPInt::Const32Metadata::value[MC], a1
operationCall(macro() cCall2(_ipint_extern_table_size) end)
pushQuad(r0)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_table_fill, macro()
# table.fill
move sp, a1
move MC, a2
operationCallMayThrow(macro() cCall3(_ipint_extern_table_fill) end)
addp 3 * StackValueSize, sp
loadb IPInt::TableFillMetadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::TableFillMetadata)))
nextIPIntInstruction()
end)
#######################
## SIMD Instructions ##
#######################
const ImmLaneIdxOffset = 2 # Offset in bytecode
const ImmLaneIdx16Mask = 0xf
const ImmLaneIdx8Mask = 0x7
const ImmLaneIdx4Mask = 0x3
const ImmLaneIdx2Mask = 0x1
# 0xFD 0x00 - 0xFD 0x0B: memory
# Wrapper for SIMD load/store operations. Places linear address in t0 for memOp()
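# accessSize is the access width in bytes; the constant memarg offset comes from
# IPInt::Const32Metadata::value and is added to the popped index before the bounds check.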
macro simdMemoryOp(accessSize, memOp)
popMemoryIndex(t0, t2)
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
ipintCheckMemoryBound(t0, t2, accessSize)
memOp()
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end
ipintOp(_simd_v128_load_mem, macro()
# v128.load
simdMemoryOp(16, macro()
loadv [memoryBase, t0], v0
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load_8x8s_mem, macro()
# v128.load8x8_s - load 8 8-bit values, sign-extend each to i16
simdMemoryOp(8, macro()
if ARM64 or ARM64E
loadd [memoryBase, t0], ft0
# offlineasm ft0 = ARM v0
# offlineasm v0 = ARM v16
emit "sxtl v16.8h, v0.8b"
elsif X86_64
# memoryBase is r14, t0 is eax
emit "pmovsxbw (%r14,%rax), %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load_8x8u_mem, macro()
# v128.load8x8_u - load 8 8-bit values, zero-extend each to i16
simdMemoryOp(8, macro()
if ARM64 or ARM64E
loadd [memoryBase, t0], ft0
# offlineasm ft0 = ARM v0
# offlineasm v0 = ARM v16
emit "uxtl v16.8h, v0.8b"
elsif X86_64
# memoryBase is r14, t0 is eax
emit "pmovzxbw (%r14,%rax), %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load_16x4s_mem, macro()
# v128.load16x4_s - load 4 16-bit values, sign-extend each to i32
simdMemoryOp(8, macro()
if ARM64 or ARM64E
loadd [memoryBase, t0], ft0
# offlineasm ft0 = ARM v0
# offlineasm v0 = ARM v16
emit "sxtl v16.4s, v0.4h"
elsif X86_64
# memoryBase is r14, t0 is eax
emit "pmovsxwd (%r14,%rax), %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load_16x4u_mem, macro()
# v128.load16x4_u - load 4 16-bit values, zero-extend each to i32
simdMemoryOp(8, macro()
if ARM64 or ARM64E
loadd [memoryBase, t0], ft0
# offlineasm ft0 = ARM v0
# offlineasm v0 = ARM v16
emit "uxtl v16.4s, v0.4h"
elsif X86_64
# memoryBase is r14, t0 is eax
emit "pmovzxwd (%r14,%rax), %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load_32x2s_mem, macro()
# v128.load32x2_s - load 2 32-bit values, sign-extend each to i64
simdMemoryOp(8, macro()
if ARM64 or ARM64E
loadd [memoryBase, t0], ft0
# offlineasm ft0 = ARM v0
# offlineasm v0 = ARM v16
emit "sxtl v16.2d, v0.2s"
elsif X86_64
# memoryBase is r14, t0 is eax
emit "pmovsxdq (%r14,%rax), %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load_32x2u_mem, macro()
# v128.load32x2_u - load 2 32-bit values, zero-extend each to i64
simdMemoryOp(8, macro()
if ARM64 or ARM64E
loadd [memoryBase, t0], ft0
# offlineasm ft0 = ARM v0
# offlineasm v0 = ARM v16
emit "uxtl v16.2d, v0.2s"
elsif X86_64
# memoryBase is r14, t0 is eax
emit "pmovzxdq (%r14,%rax), %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load8_splat_mem, macro()
# v128.load8_splat - load 1 8-bit value and splat to all 16 lanes
simdMemoryOp(1, macro()
if ARM64 or ARM64E
loadb [memoryBase, t0], t1
emit "dup v16.16b, w1"
elsif X86_64
# memoryBase is r14, t0 is eax
emit "vpinsrb $0, (%r14,%rax), %xmm0, %xmm0"
emit "vpxor %xmm1, %xmm1, %xmm1"
emit "vpshufb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load16_splat_mem, macro()
# v128.load16_splat - load 1 16-bit value and splat to all 8 lanes
simdMemoryOp(2, macro()
if ARM64 or ARM64E
loadh [memoryBase, t0], t1
emit "dup v16.8h, w1"
elsif X86_64
# memoryBase is r14, t0 is eax
emit "vpinsrw $0, (%r14,%rax), %xmm0, %xmm0"
emit "vpshuflw $0, %xmm0, %xmm0"
emit "vpunpcklqdq %xmm0, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load32_splat_mem, macro()
# v128.load32_splat - load 1 32-bit value and splat to all 4 lanes
simdMemoryOp(4, macro()
if ARM64 or ARM64E
loadi [memoryBase, t0], t1
emit "dup v16.4s, w1"
elsif X86_64
# Load and broadcast 32-bit value directly from memory to all 4 dwords
# memoryBase is r14, t0 is eax
emit "vbroadcastss (%r14,%rax), %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_load64_splat_mem, macro()
# v128.load64_splat - load 1 64-bit value and splat to all 2 lanes
simdMemoryOp(8, macro()
if ARM64 or ARM64E
loadq [memoryBase, t0], t1
emit "dup v16.2d, x1"
elsif X86_64
# Load and broadcast 64-bit value directly from memory to both qwords
# memoryBase is r14, t0 is eax
emit "vmovddup (%r14,%rax), %xmm0"
else
break # Not implemented
end
pushVec(v0)
end)
end)
ipintOp(_simd_v128_store_mem, macro()
# v128.store
popVec(v0)
simdMemoryOp(16, macro()
storev v0, [memoryBase, t0]
end)
end)
# 0xFD 0x0C: v128.const
ipintOp(_simd_v128_const, macro()
# v128.const
loadv 2[PC], v0
pushVec(v0)
advancePC(18)
nextIPIntInstruction()
end)
# 0xFD 0x0D - 0xFD 0x14: splat (+ shuffle/swizzle)
ipintOp(_simd_i8x16_shuffle, macro()
# i8x16.shuffle - shuffle bytes from two vectors using 16 immediate indices
if ARM64 or ARM64E
popVec(v1)
popVec(v0)
loadv ImmLaneIdxOffset[PC], v2
emit "tbl v16.16b, {v16.16b, v17.16b}, v18.16b"
pushVec(v0)
else
        # X86_64 has no two-source byte shuffle usable here, so emulate it byte by byte on the stack
subp V128ISize, sp # Allocate temp result
# Loop through 16 output positions
move 0, t0
.shuffleLoop:
loadb ImmLaneIdxOffset[PC, t0, 1], t1
bigt t1, 31, .outOfBounds
bigt t1, 15, .useRightVector
.useLeftVector:
loadb 32[sp, t1], t2
jmp .storeByte
.useRightVector:
subq t1, 16, t3
loadb 16[sp, t3], t2
jmp .storeByte
.outOfBounds:
move 0, t2
.storeByte:
storeb t2, [sp, t0] # Store to temp result
addq 1, t0 # Increment loop counter
bilt t0, 16, .shuffleLoop
# Copy temp result to final result location
loadq [sp], t0
loadq 8[sp], t1
storeq t0, 32[sp]
storeq t1, 40[sp]
addp 2 * V128ISize, sp # Pop temp result and right vector
end
advancePC(18) # 2 bytes opcode + 16 bytes immediate
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_swizzle, macro()
# i8x16.swizzle - swizzle bytes from first vector using indices from second vector
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "tbl v16.16b, {v16.16b}, v17.16b"
elsif X86_64
# vpshufb only checks bit 7 for out-of-bounds (returns 0 if bit 7 is set)
# WebAssembly requires returning 0 for any index >= 16
# Add 0x70 with unsigned saturation, so any index > 15 sets bit 7
# (15 + 0x70 = 0x7F, anything > 15 saturates to 0xFF)
# See BBQJIT::fixupOutOfBoundsIndicesForSwizzle
emit "movabsq $0x7070707070707070, %rax"
emit "vmovq %rax, %xmm2"
emit "vpunpcklqdq %xmm2, %xmm2, %xmm2" # xmm2 = [0x70, 0x70, ..., 0x70] (16 bytes)
emit "vpaddusb %xmm2, %xmm1, %xmm1" # Saturating add to set bit 7 for indices > 15
emit "vpshufb %xmm1, %xmm0, %xmm0" # Now vpshufb will return 0 for out-of-bounds
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_splat, macro()
# i8x16.splat - splat i32 value to all 16 8-bit lanes
popInt32(t0)
if ARM64 or ARM64E
emit "dup v16.16b, w0"
elsif X86_64
# t0 is eax on X86_64, move to xmm0 and broadcast to all 16 bytes
emit "vmovd %eax, %xmm0"
emit "vpinsrb $1, %eax, %xmm0, %xmm0"
emit "vpshuflw $0, %xmm0, %xmm0"
emit "vpunpcklqdq %xmm0, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_splat, macro()
# i16x8.splat - splat i32 value to all 8 16-bit lanes
popInt32(t0)
if ARM64 or ARM64E
emit "dup v16.8h, w0"
elsif X86_64
# t0 is eax on X86_64, move to xmm0 and broadcast to all 8 words
emit "vmovd %eax, %xmm0"
emit "vpshuflw $0, %xmm0, %xmm0"
emit "vpunpcklqdq %xmm0, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_splat, macro()
# i32x4.splat - splat i32 value to all 4 32-bit lanes
popInt32(t0)
if ARM64 or ARM64E
emit "dup v16.4s, w0"
elsif X86_64
# t0 is eax on X86_64, move to xmm0 and broadcast to all 4 dwords
emit "vmovd %eax, %xmm0"
emit "vshufps $0, %xmm0, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_splat, macro()
# i64x2.splat - splat i64 value to all 2 64-bit lanes
popInt64(t0)
if ARM64 or ARM64E
emit "dup v16.2d, x0"
elsif X86_64
# t0 is rax on X86_64
emit "vmovq %rax, %xmm0"
emit "vmovddup %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_splat, macro()
# f32x4.splat - splat f32 value to all 4 32-bit float lanes
popFloat32(ft0)
if ARM64 or ARM64E
emit "dup v16.4s, v0.s[0]"
elsif X86_64
# ft0 is xmm0 on X86_64, broadcast to all 4 float lanes
emit "vshufps $0x00, %xmm0, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_splat, macro()
# f64x2.splat - splat f64 value to all 2 64-bit float lanes
popFloat64(ft0)
if ARM64 or ARM64E
emit "dup v16.2d, v0.d[0]"
elsif X86_64
# ft0 is xmm0 on X86_64, duplicate lower 64-bit to both lanes
emit "vmovddup %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x15 - 0xFD 0x22: extract and replace lanes
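# Lane accessors operate on the 16-byte vector directly on the stack: extract pops the
# vector and pushes the scalar lane; replace pops the scalar and patches the lane of the
# vector left on the stack.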
ipintOp(_simd_i8x16_extract_lane_s, macro()
# i8x16.extract_lane_s (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx16Mask, t0
loadbsi [sp, t0], t0
addp V128ISize, sp
pushInt32(t0)
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_extract_lane_u, macro()
# i8x16.extract_lane_u (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx16Mask, t0
loadb [sp, t0], t0
addp V128ISize, sp
pushInt32(t0)
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_replace_lane, macro()
# i8x16.replace_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx16Mask, t0
popInt32(t1) # value to replace with
storeb t1, [sp, t0] # replace the byte at lane index
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extract_lane_s, macro()
# i16x8.extract_lane_s (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx8Mask, t0
loadhsi [sp, t0, 2], t0
addp V128ISize, sp
pushInt32(t0)
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extract_lane_u, macro()
# i16x8.extract_lane_u (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx8Mask, t0
loadh [sp, t0, 2], t0
addp V128ISize, sp
pushInt32(t0)
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_replace_lane, macro()
# i16x8.replace_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx8Mask, t0
popInt32(t1) # value to replace with
storeh t1, [sp, t0, 2] # replace the 16-bit value at lane index
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_extract_lane, macro()
# i32x4.extract_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx4Mask, t0
loadi [sp, t0, 4], t0
addp V128ISize, sp
pushInt32(t0)
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_replace_lane, macro()
# i32x4.replace_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx4Mask, t0
popInt32(t1) # value to replace with
storei t1, [sp, t0, 4] # replace the 32-bit value at lane index
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_extract_lane, macro()
# i64x2.extract_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx2Mask, t0
loadq [sp, t0, 8], t0
addp V128ISize, sp
pushInt64(t0)
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_replace_lane, macro()
# i64x2.replace_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx2Mask, t0
popInt64(t1) # value to replace with
storeq t1, [sp, t0, 8] # replace the 64-bit value at lane index
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_extract_lane, macro()
# f32x4.extract_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx4Mask, t0
loadf [sp, t0, 4], ft0
addp V128ISize, sp
pushFloat32(ft0)
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_replace_lane, macro()
# f32x4.replace_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx4Mask, t0
popFloat32(ft0) # value to replace with
storef ft0, [sp, t0, 4] # replace the 32-bit float at lane index
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_extract_lane, macro()
# f64x2.extract_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx2Mask, t0
loadd [sp, t0, 8], ft0
addp V128ISize, sp
pushFloat64(ft0)
advancePC(3)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_replace_lane, macro()
# f64x2.replace_lane (lane)
loadb ImmLaneIdxOffset[PC], t0
andi ImmLaneIdx2Mask, t0
popFloat64(ft0) # value to replace with
stored ft0, [sp, t0, 8] # replace the 64-bit float at lane index
advancePC(3)
nextIPIntInstruction()
end)
# 0xFD 0x23 - 0xFD 0x2C: i8x16 operations
ipintOp(_simd_i8x16_eq, macro()
# i8x16.eq - compare 16 8-bit integers for equality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpcmpeqb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_ne, macro()
# i8x16.ne - compare 16 8-bit integers for inequality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# Compare 16 bytes for equality, then invert the result
emit "cmeq v16.16b, v16.16b, v17.16b"
emit "mvn v16.16b, v16.16b"
elsif X86_64
# Compare for equality, then invert the result
emit "vpcmpeqb %xmm1, %xmm0, %xmm0"
emit "vpcmpeqb %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_lt_s, macro()
# i8x16.lt_s - compare 16 8-bit signed integers for less than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1
emit "cmgt v16.16b, v17.16b, v16.16b"
elsif X86_64
# vpcmpgtb xmm1, xmm0 gives us xmm1 > xmm0, which is equivalent to xmm0 < xmm1
emit "vpcmpgtb %xmm0, %xmm1, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_lt_u, macro()
# i8x16.lt_u - compare 16 8-bit unsigned integers for less than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmhi v17, v16 gives us v1 > v0 (unsigned), which is equivalent to v0 < v1
emit "cmhi v16.16b, v17.16b, v16.16b"
elsif X86_64
        # For unsigned comparison, we need to use a min/max approach since there's no direct unsigned compare
emit "vpminub %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqb %xmm0, %xmm2, %xmm2" # xmm0 == min ? (xmm0 <= xmm1)
emit "vpcmpeqb %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ?
emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm0 <= xmm1) && (xmm0 != xmm1) = (xmm0 < xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_gt_s, macro()
# i8x16.gt_s - compare 16 8-bit signed integers for greater than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmgt v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpcmpgtb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_gt_u, macro()
# i8x16.gt_u - compare 16 8-bit unsigned integers for greater than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmhi v16.16b, v16.16b, v17.16b"
elsif X86_64
# For unsigned comparison: xmm0 > xmm1 iff min(xmm0, xmm1) == xmm1 && xmm0 != xmm1
emit "vpminub %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqb %xmm1, %xmm2, %xmm2" # xmm1 == min ? (xmm1 <= xmm0)
emit "vpcmpeqb %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ?
emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm1 <= xmm0) && (xmm0 != xmm1) = (xmm0 > xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_le_s, macro()
# i8x16.le_s - compare 16 8-bit signed integers for less than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1
emit "cmge v16.16b, v17.16b, v16.16b"
elsif X86_64
# xmm0 <= xmm1 iff !(xmm0 > xmm1)
emit "vpcmpgtb %xmm1, %xmm0, %xmm0" # xmm0 > xmm1
emit "vpcmpeqb %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm0 > xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_le_u, macro()
# i8x16.le_u - compare 16 8-bit unsigned integers for less than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmhs v17, v16 gives us v1 >= v0 (unsigned), which is equivalent to v0 <= v1
emit "cmhs v16.16b, v17.16b, v16.16b"
elsif X86_64
# xmm0 <= xmm1 iff min(xmm0, xmm1) == xmm0
emit "vpminub %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqb %xmm0, %xmm2, %xmm0" # xmm0 == min ? (xmm0 <= xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_ge_s, macro()
# i8x16.ge_s - compare 16 8-bit signed integers for greater than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmge v16.16b, v16.16b, v17.16b"
elsif X86_64
# xmm0 >= xmm1 iff !(xmm0 < xmm1) iff !(xmm1 > xmm0)
emit "vpcmpgtb %xmm0, %xmm1, %xmm0" # xmm1 > xmm0
emit "vpcmpeqb %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm1 > xmm0)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_ge_u, macro()
# i8x16.ge_u - compare 16 8-bit unsigned integers for greater than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmhs v16.16b, v16.16b, v17.16b"
elsif X86_64
# xmm0 >= xmm1 iff min(xmm0, xmm1) == xmm1
emit "vpminub %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqb %xmm1, %xmm2, %xmm0" # xmm1 == min ? (xmm1 <= xmm0) = (xmm0 >= xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x2D - 0xFD 0x36: i16x8 operations
ipintOp(_simd_i16x8_eq, macro()
# i16x8.eq - compare 8 16-bit integers for equality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpcmpeqw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_ne, macro()
# i16x8.ne - compare 8 16-bit integers for inequality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v16.8h, v16.8h, v17.8h"
emit "mvn v16.16b, v16.16b"
elsif X86_64
# Compare for equality, then invert the result
emit "vpcmpeqw %xmm1, %xmm0, %xmm0"
emit "vpcmpeqw %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_lt_s, macro()
# i16x8.lt_s - compare 8 16-bit signed integers for less than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1
emit "cmgt v16.8h, v17.8h, v16.8h"
elsif X86_64
# vpcmpgtw xmm1, xmm0 gives us xmm1 > xmm0, which is equivalent to xmm0 < xmm1
emit "vpcmpgtw %xmm0, %xmm1, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_lt_u, macro()
# i16x8.lt_u - compare 8 16-bit unsigned integers for less than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmhi v17, v16 gives us v1 > v0 (unsigned), which is equivalent to v0 < v1
emit "cmhi v16.8h, v17.8h, v16.8h"
elsif X86_64
        # For unsigned comparison, we need to use a min/max approach since there's no direct unsigned compare
emit "vpminuw %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqw %xmm0, %xmm2, %xmm2" # xmm0 == min ? (xmm0 <= xmm1)
emit "vpcmpeqw %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ?
emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm0 <= xmm1) && (xmm0 != xmm1) = (xmm0 < xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_gt_s, macro()
# i16x8.gt_s - compare 8 16-bit signed integers for greater than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmgt v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpcmpgtw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_gt_u, macro()
# i16x8.gt_u - compare 8 16-bit unsigned integers for greater than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmhi v16.8h, v16.8h, v17.8h"
elsif X86_64
# For unsigned comparison: xmm0 > xmm1 iff min(xmm0, xmm1) == xmm1 && xmm0 != xmm1
emit "vpminuw %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqw %xmm1, %xmm2, %xmm2" # xmm1 == min ? (xmm1 <= xmm0)
emit "vpcmpeqw %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ?
emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm1 <= xmm0) && (xmm0 != xmm1) = (xmm0 > xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_le_s, macro()
# i16x8.le_s - compare 8 16-bit signed integers for less than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1
emit "cmge v16.8h, v17.8h, v16.8h"
elsif X86_64
# xmm0 <= xmm1 iff !(xmm0 > xmm1)
emit "vpcmpgtw %xmm1, %xmm0, %xmm0" # xmm0 > xmm1
emit "vpcmpeqw %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm0 > xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_le_u, macro()
# i16x8.le_u - compare 8 16-bit unsigned integers for less than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmhs v17, v16 gives us v1 >= v0 (unsigned), which is equivalent to v0 <= v1
emit "cmhs v16.8h, v17.8h, v16.8h"
elsif X86_64
# xmm0 <= xmm1 iff min(xmm0, xmm1) == xmm0
emit "vpminuw %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqw %xmm0, %xmm2, %xmm0" # xmm0 == min ? (xmm0 <= xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_ge_s, macro()
# i16x8.ge_s - compare 8 16-bit signed integers for greater than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmge v16.8h, v16.8h, v17.8h"
elsif X86_64
# xmm0 >= xmm1 iff !(xmm0 < xmm1) iff !(xmm1 > xmm0)
emit "vpcmpgtw %xmm0, %xmm1, %xmm0" # xmm1 > xmm0
emit "vpcmpeqw %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm1 > xmm0)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_ge_u, macro()
# i16x8.ge_u - compare 8 16-bit unsigned integers for greater than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmhs v16.8h, v16.8h, v17.8h"
elsif X86_64
# xmm0 >= xmm1 iff min(xmm0, xmm1) == xmm1
emit "vpminuw %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqw %xmm1, %xmm2, %xmm0" # xmm1 == min ? (xmm1 <= xmm0) = (xmm0 >= xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x37 - 0xFD 0x40: i32x4 operations
ipintOp(_simd_i32x4_eq, macro()
# i32x4.eq - compare 4 32-bit integers for equality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vpcmpeqd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_ne, macro()
# i32x4.ne - compare 4 32-bit integers for inequality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v16.4s, v16.4s, v17.4s"
emit "mvn v16.16b, v16.16b"
elsif X86_64
# Compare for equality, then invert the result
emit "vpcmpeqd %xmm1, %xmm0, %xmm0"
emit "vpcmpeqd %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_lt_s, macro()
# i32x4.lt_s - compare 4 32-bit signed integers for less than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1
emit "cmgt v16.4s, v17.4s, v16.4s"
elsif X86_64
# vpcmpgtd xmm1, xmm0 gives us xmm1 > xmm0, which is equivalent to xmm0 < xmm1
emit "vpcmpgtd %xmm0, %xmm1, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_lt_u, macro()
# i32x4.lt_u - compare 4 32-bit unsigned integers for less than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmhi v17, v16 gives us v1 > v0 (unsigned), which is equivalent to v0 < v1
emit "cmhi v16.4s, v17.4s, v16.4s"
elsif X86_64
        # For unsigned comparison, we need to use a min/max approach
emit "vpminud %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqd %xmm0, %xmm2, %xmm2" # xmm0 == min ? (xmm0 <= xmm1)
emit "vpcmpeqd %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ?
emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm0 <= xmm1) && (xmm0 != xmm1) = (xmm0 < xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_gt_s, macro()
# i32x4.gt_s - compare 4 32-bit signed integers for greater than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmgt v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vpcmpgtd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_gt_u, macro()
# i32x4.gt_u - compare 4 32-bit unsigned integers for greater than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmhi v16.4s, v16.4s, v17.4s"
elsif X86_64
# For unsigned comparison: xmm0 > xmm1 iff min(xmm0, xmm1) == xmm1 && xmm0 != xmm1
emit "vpminud %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqd %xmm1, %xmm2, %xmm2" # xmm1 == min ? (xmm1 <= xmm0)
emit "vpcmpeqd %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ?
emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm1 <= xmm0) && (xmm0 != xmm1) = (xmm0 > xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_le_s, macro()
# i32x4.le_s - compare 4 32-bit signed integers for less than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1
emit "cmge v16.4s, v17.4s, v16.4s"
elsif X86_64
# xmm0 <= xmm1 iff !(xmm0 > xmm1)
emit "vpcmpgtd %xmm1, %xmm0, %xmm0" # xmm0 > xmm1
emit "vpcmpeqd %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm0 > xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_le_u, macro()
# i32x4.le_u - compare 4 32-bit unsigned integers for less than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmhs v17, v16 gives us v1 >= v0 (unsigned), which is equivalent to v0 <= v1
emit "cmhs v16.4s, v17.4s, v16.4s"
elsif X86_64
# xmm0 <= xmm1 iff min(xmm0, xmm1) == xmm0
emit "vpminud %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqd %xmm0, %xmm2, %xmm0" # xmm0 == min ? (xmm0 <= xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_ge_s, macro()
# i32x4.ge_s - compare 4 32-bit signed integers for greater than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmge v16.4s, v16.4s, v17.4s"
elsif X86_64
# xmm0 >= xmm1 iff !(xmm0 < xmm1) iff !(xmm1 > xmm0)
emit "vpcmpgtd %xmm0, %xmm1, %xmm0" # xmm1 > xmm0
emit "vpcmpeqd %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm1 > xmm0)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_ge_u, macro()
# i32x4.ge_u - compare 4 32-bit unsigned integers for greater than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmhs v16.4s, v16.4s, v17.4s"
elsif X86_64
# xmm0 >= xmm1 iff min(xmm0, xmm1) == xmm1
emit "vpminud %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2
emit "vpcmpeqd %xmm1, %xmm2, %xmm0" # xmm1 == min ? (xmm1 <= xmm0) = (xmm0 >= xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x41 - 0xFD 0x46: f32x4 operations
ipintOp(_simd_f32x4_eq, macro()
# f32x4.eq - compare 4 32-bit floats for equality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fcmeq v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vcmpeqps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_ne, macro()
# f32x4.ne - compare 4 32-bit floats for inequality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fcmeq v16.4s, v16.4s, v17.4s"
emit "mvn v16.16b, v16.16b"
elsif X86_64
emit "vcmpneqps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_lt, macro()
# f32x4.lt - compare 4 32-bit floats for less than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# fcmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1
emit "fcmgt v16.4s, v17.4s, v16.4s"
elsif X86_64
emit "vcmpltps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_gt, macro()
# f32x4.gt - compare 4 32-bit floats for greater than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fcmgt v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vcmpgtps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_le, macro()
# f32x4.le - compare 4 32-bit floats for less than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# fcmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1
emit "fcmge v16.4s, v17.4s, v16.4s"
elsif X86_64
emit "vcmpleps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_ge, macro()
# f32x4.ge - compare 4 32-bit floats for greater than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fcmge v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vcmpgeps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x47 - 0xFD 0x4c: f64x2 operations
ipintOp(_simd_f64x2_eq, macro()
# f64x2.eq - compare 2 64-bit floats for equality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fcmeq v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vcmpeqpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_ne, macro()
# f64x2.ne - compare 2 64-bit floats for inequality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fcmeq v16.2d, v16.2d, v17.2d"
emit "mvn v16.16b, v16.16b"
elsif X86_64
emit "vcmpneqpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_lt, macro()
# f64x2.lt - compare 2 64-bit floats for less than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# fcmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1
emit "fcmgt v16.2d, v17.2d, v16.2d"
elsif X86_64
emit "vcmpltpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_gt, macro()
# f64x2.gt - compare 2 64-bit floats for greater than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fcmgt v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vcmpgtpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_le, macro()
# f64x2.le - compare 2 64-bit floats for less than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# fcmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1
emit "fcmge v16.2d, v17.2d, v16.2d"
elsif X86_64
emit "vcmplepd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_ge, macro()
# f64x2.ge - compare 2 64-bit floats for greater than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fcmge v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vcmpgepd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x4D - 0xFD 0x53: v128 operations
ipintOp(_simd_v128_not, macro()
# v128.not - bitwise NOT of 128-bit vector
popVec(v0)
if ARM64 or ARM64E
emit "mvn v16.16b, v16.16b"
elsif X86_64
emit "vpcmpeqb %xmm1, %xmm1, %xmm1" # Set all bits to 1
emit "vpxor %xmm1, %xmm0, %xmm0" # Invert all bits
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_v128_and, macro()
# v128.and - bitwise AND of two 128-bit vectors
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "and v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpand %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_v128_andnot, macro()
# v128.andnot - bitwise AND NOT of two 128-bit vectors (v0 & ~v1)
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "bic v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpandn %xmm0, %xmm1, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_v128_or, macro()
# v128.or - bitwise OR of two 128-bit vectors
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "orr v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpor %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_v128_xor, macro()
# v128.xor - bitwise XOR of two 128-bit vectors
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "eor v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpxor %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_v128_bitselect, macro()
# v128.bitselect - bitwise select: (a & c) | (b & ~c)
popVec(v2) # selector c
popVec(v1) # b
popVec(v0) # a
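# e.g. per byte: a = 0xFF, b = 0x00, c = 0xF0 -> (a & c) | (b & ~c) = 0xF0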
if ARM64 or ARM64E
# Use BSL (Bit Select) instruction: bsl vd, vn, vm
# BSL performs: vd = (vd & vn) | (~vd & vm)
# We need: result = (a & c) | (b & ~c)
# So we put c in the destination, then BSL with a and b
emit "mov v18.16b, v18.16b" # v2 -> v18 (selector)
emit "bsl v18.16b, v16.16b, v17.16b" # (c & a) | (~c & b)
emit "mov v16.16b, v18.16b" # result -> v0
elsif X86_64
emit "vpand %xmm2, %xmm0, %xmm3" # xmm3 = a & c
emit "vpandn %xmm1, %xmm2, %xmm2" # xmm2 = b & ~c (vpandn does ~src1 & src2)
emit "vpor %xmm2, %xmm3, %xmm0" # xmm0 = (a & c) | (b & ~c)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_v128_any_true, macro()
# v128.any_true - return 1 if any bit is set, 0 otherwise
popVec(v0)
if ARM64 or ARM64E
# Use UMAXV to find maximum across all bytes
emit "umaxv b16, v16.16b"
# Extract the result to general purpose register
emit "fmov w0, s16"
# Convert non-zero to 1
emit "cmp w0, #0"
emit "cset w0, ne"
elsif X86_64
emit "vptest %xmm0, %xmm0"
emit "setne %al" # Set AL to 1 if ZF=0 (any bit set), 0 if ZF=1 (all zero)
emit "movzbl %al, %eax" # Zero-extend AL to EAX
else
break # Not implemented
end
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x54 - 0xFD 0x5D: v128 load/store lane
ipintOp(_simd_v128_load8_lane_mem, macro()
# v128.load8_lane - load 8-bit value from memory and replace lane in existing vector
popVec(v0)
popMemoryIndex(t0, t2)
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
ipintCheckMemoryBound(t0, t2, 1)
loadb [memoryBase, t0], t0
# The lane index comes after the variable length memory offset, so find it by
# advancing the PC and loading the byte before the next instruction.
loadb IPInt::Const32Metadata::instructionLength[MC], t1
advancePCByReg(t1)
loadb -1[PC], t1
andi ImmLaneIdx16Mask, t1
# Push the result and then replace one lane of the result with the loaded value
pushVec(v0)
storeb t0, [sp, t1]
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_simd_v128_load16_lane_mem, macro()
# v128.load16_lane - load 16-bit value from memory and replace lane in existing vector
popVec(v0)
popMemoryIndex(t0, t2)
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
ipintCheckMemoryBound(t0, t2, 2)
loadh [memoryBase, t0], t0
# The lane index comes after the variable length memory offset, so find it by
# advancing the PC and loading the byte before the next instruction.
loadb IPInt::Const32Metadata::instructionLength[MC], t1
advancePCByReg(t1)
loadb -1[PC], t1
andi ImmLaneIdx8Mask, t1
# Push the result and then replace one lane of the result with the loaded value
pushVec(v0)
storeh t0, [sp, t1, 2]
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_simd_v128_load32_lane_mem, macro()
# v128.load32_lane - load 32-bit value from memory and replace lane in existing vector
popVec(v0)
popMemoryIndex(t0, t2)
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
ipintCheckMemoryBound(t0, t2, 4)
loadi [memoryBase, t0], t0
# The lane index comes after the variable length memory offset, so find it by
# advancing the PC and loading the byte before the next instruction.
loadb IPInt::Const32Metadata::instructionLength[MC], t1
advancePCByReg(t1)
loadb -1[PC], t1
andi ImmLaneIdx4Mask, t1
# Push the result and then replace one lane of the result with the loaded value
pushVec(v0)
storei t0, [sp, t1, 4]
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_simd_v128_load64_lane_mem, macro()
# v128.load64_lane - load 64-bit value from memory and replace lane in existing vector
popVec(v0)
popMemoryIndex(t0, t2)
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
ipintCheckMemoryBound(t0, t2, 8)
loadq [memoryBase, t0], t0
# The lane index comes after the variable length memory offset, so find it by
# advancing the PC and loading the byte before the next instruction.
loadb IPInt::Const32Metadata::instructionLength[MC], t1
advancePCByReg(t1)
loadb -1[PC], t1
andi ImmLaneIdx2Mask, t1
# Push the result and then replace one lane of the result with the loaded value
pushVec(v0)
storeq t0, [sp, t1, 8]
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_simd_v128_store8_lane_mem, macro()
# v128.store8_lane - extract 8-bit value from lane and store to memory
# The lane index comes after the variable length memory offset, so find it by
# advancing the PC and loading the byte before the next instruction.
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
loadb -1[PC], t1
andi ImmLaneIdx16Mask, t1
loadb [sp, t1], t1 # Load value from lane in vector on stack
addp V128ISize, sp # Pop the vector
popMemoryIndex(t0, t2)
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
ipintCheckMemoryBound(t0, t2, 1)
storeb t1, [memoryBase, t0]
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_simd_v128_store16_lane_mem, macro()
# v128.store16_lane - extract 16-bit value from lane and store to memory
# The lane index comes after the variable length memory offset, so find it by
# advancing the PC and loading the byte before the next instruction.
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
loadb -1[PC], t1
andi ImmLaneIdx8Mask, t1
loadh [sp, t1, 2], t1 # Load value from lane in vector on stack
addp V128ISize, sp # Pop the vector
popMemoryIndex(t0, t2)
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
ipintCheckMemoryBound(t0, t2, 2)
storeh t1, [memoryBase, t0]
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_simd_v128_store32_lane_mem, macro()
# v128.store32_lane - extract 32-bit value from lane and store to memory
# The lane index comes after the variable length memory offset, so find it by
# advancing the PC and loading the byte before the next instruction.
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
loadb -1[PC], t1
andi ImmLaneIdx4Mask, t1
loadi [sp, t1, 4], t1 # Load value from lane in vector on stack
addp V128ISize, sp # Pop the vector
popMemoryIndex(t0, t2)
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
ipintCheckMemoryBound(t0, t2, 4)
storei t1, [memoryBase, t0]
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_simd_v128_store64_lane_mem, macro()
# v128.store64_lane - extract 64-bit value from lane and store to memory
# The lane index comes after the variable length memory offset, so find it by
# advancing the PC and loading the byte before the next instruction.
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
loadb -1[PC], t1
andi ImmLaneIdx2Mask, t1
loadq [sp, t1, 8], t1 # Load value from lane in vector on stack
addp V128ISize, sp # Pop the vector
popMemoryIndex(t0, t2)
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
ipintCheckMemoryBound(t0, t2, 8)
storeq t1, [memoryBase, t0]
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end)
ipintOp(_simd_v128_load32_zero_mem, macro()
# v128.load32_zero - load 32-bit value from memory and zero-pad to 128 bits
simdMemoryOp(4, macro()
loadi [memoryBase, t0], t0
subp V128ISize, sp
storei t0, [sp]
storei 0, 4[sp]
storeq 0, 8[sp]
end)
end)
ipintOp(_simd_v128_load64_zero_mem, macro()
# v128.load64_zero - load 64-bit value from memory and zero-pad to 128 bits
simdMemoryOp(8, macro()
loadq [memoryBase, t0], t0
subp V128ISize, sp
storeq t0, [sp]
storeq 0, 8[sp]
end)
end)
# 0xFD 0x5E - 0xFD 0x5F: f32x4/f64x2 conversion
ipintOp(_simd_f32x4_demote_f64x2_zero, macro()
# f32x4.demote_f64x2_zero - demote 2 f64 values to f32, zero upper 2 lanes
popVec(v0)
if ARM64 or ARM64E
# Convert the two f64 values in lanes 0,1 to f32 and store in lanes 0,1
emit "fcvtn v16.2s, v16.2d"
# Zero the upper 64 bits (lanes 2,3)
emit "mov v16.d[1], xzr"
elsif X86_64
emit "vcvtpd2ps %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_promote_low_f32x4, macro()
# f64x2.promote_low_f32x4 - promote lower 2 f32 values to f64
popVec(v0)
if ARM64 or ARM64E
emit "fcvtl v16.2d, v16.2s"
elsif X86_64
emit "vcvtps2pd %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x60 - 0xFD 0x66: i8x16 operations
ipintOp(_simd_i8x16_abs, macro()
# i8x16.abs - absolute value of 16 8-bit signed integers
popVec(v0)
if ARM64 or ARM64E
emit "abs v16.16b, v16.16b"
elsif X86_64
emit "vpabsb %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_neg, macro()
# i8x16.neg - negate 16 8-bit integers
popVec(v0)
if ARM64 or ARM64E
emit "neg v16.16b, v16.16b"
elsif X86_64
# Negate by subtracting from zero
emit "vpxor %xmm1, %xmm1, %xmm1"
emit "vpsubb %xmm0, %xmm1, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_popcnt, macro()
# i8x16.popcnt - population count (count set bits) for 16 8-bit integers
popVec(v0)
if ARM64 or ARM64E
emit "cnt v16.16b, v16.16b"
elsif X86_64
# x86_64 does not natively support vector lanewise popcount, so we emulate it using
# lookup tables, similar to BBQ JIT implementation
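# Example: input byte 0xB6 (0b10110110): low nibble 0x6 -> table[0x6] = 2, high nibble 0xB -> table[0xB] = 3, so popcount = 5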
# Create bottom nibble mask (0x0f repeated 16 times)
emit "movabsq $0x0f0f0f0f0f0f0f0f, %rax"
emit "vmovq %rax, %xmm1"
emit "vmovq %rax, %xmm4"
emit "vpunpcklqdq %xmm4, %xmm1, %xmm1" # xmm1 = bottom nibble mask
# Create popcount lookup table
emit "movabsq $0x0302020102010100, %rax" # Low 64 bits of lookup table
emit "vmovq %rax, %xmm2"
emit "movabsq $0x0403030203020201, %rax" # High 64 bits of lookup table
emit "vmovq %rax, %xmm4"
emit "vpunpcklqdq %xmm4, %xmm2, %xmm2" # xmm2 = popcount lookup table
# Split input into low and high nibbles
emit "vmovdqa %xmm0, %xmm3" # xmm3 = copy of input
emit "vpand %xmm1, %xmm0, %xmm0" # xmm0 = low nibbles (input & mask)
emit "vpsrlw $4, %xmm3, %xmm3" # Shift right 4 bits
emit "vpand %xmm1, %xmm3, %xmm3" # xmm3 = high nibbles ((input >> 4) & mask)
# Lookup popcount for both nibbles using pshufb
emit "vpshufb %xmm0, %xmm2, %xmm0" # Lookup low nibbles
emit "vpshufb %xmm3, %xmm2, %xmm3" # Lookup high nibbles
# Add the results
emit "vpaddb %xmm3, %xmm0, %xmm0" # Add popcount of low and high nibbles
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_all_true, macro()
# i8x16.all_true - return 1 if all 16 8-bit lanes are non-zero, 0 otherwise
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v17.16b, v16.16b, #0" # Compare each lane with 0
emit "umaxv b17, v17.16b" # Find maximum (any zero lane will make this non-zero)
emit "fmov w0, s17" # Move to general register
emit "cmp w0, #0" # Compare with 0
emit "cset w0, eq" # Set to 1 if equal (all lanes non-zero), 0 otherwise
elsif X86_64
# Compare each byte with zero to create mask of zero lanes
emit "vpxor %xmm1, %xmm1, %xmm1" # Create zero vector
emit "vpcmpeqb %xmm1, %xmm0, %xmm0" # Compare each byte with 0 (0xFF if zero, 0x00 if non-zero)
emit "vpmovmskb %xmm0, %eax" # Extract sign bits to create 16-bit mask
emit "test %eax, %eax" # Test if any bit is set (any lane was zero)
emit "sete %al" # Set AL to 1 if no bits set (all lanes non-zero), 0 otherwise
emit "movzbl %al, %eax" # Zero-extend to full 32-bit register
else
break # Not implemented
end
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_bitmask, macro()
# i8x16.bitmask - extract most significant bit from each 8-bit lane into a 16-bit integer
# Simple loop over the 16 bytes on the stack
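# e.g. if only lanes 0 and 15 have their sign bits set, the result is 0x8001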
move 0, t0 # Initialize result
move 0, t3 # Byte counter
.bitmask_i8x16_loop:
# Load byte and check sign bit
loadb [sp, t3], t1
andq 0x80, t1 # Extract sign bit
btiz t1, .bitmask_i8x16_next
# Set corresponding bit in result
move 1, t1
lshiftq t3, t1 # Shift to bit position
orq t1, t0
.bitmask_i8x16_next:
addq 1, t3 # Next byte
bilt t3, 16, .bitmask_i8x16_loop
addp V128ISize, sp # Pop the vector
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_narrow_i16x8_s, macro()
# i8x16.narrow_i16x8_s - narrow 2 i16x8 vectors to 1 i8x16 vector with signed saturation
popVec(v1) # Second operand
popVec(v0) # First operand
if ARM64 or ARM64E
# Signed saturating extract narrow: combine v0.8h and v1.8h into v16.16b
emit "sqxtn v16.8b, v16.8h" # Narrow first vector (v0) to lower 8 bytes
emit "sqxtn2 v16.16b, v17.8h" # Narrow second vector (v1) to upper 8 bytes
elsif X86_64
emit "vpacksswb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_narrow_i16x8_u, macro()
# i8x16.narrow_i16x8_u - narrow 2 i16x8 vectors to 1 i8x16 vector with unsigned saturation
popVec(v1) # Second operand
popVec(v0) # First operand
if ARM64 or ARM64E
# Signed saturating extract unsigned narrow: combine v0.8h and v1.8h into v16.16b
emit "sqxtun v16.8b, v16.8h" # Narrow first vector (v0) to lower 8 bytes
emit "sqxtun2 v16.16b, v17.8h" # Narrow second vector (v1) to upper 8 bytes
elsif X86_64
emit "vpackuswb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x67 - 0xFD 0x6A: f32x4 operations
ipintOp(_simd_f32x4_ceil, macro()
# f32x4.ceil - ceiling of 4 32-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "frintp v16.4s, v16.4s"
elsif X86_64
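# vroundps imm8 (bit 2 clear): bits 1:0 select the mode - 0 = nearest (ties to even), 1 = floor, 2 = ceil, 3 = truncate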
emit "vroundps $0x2, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_floor, macro()
# f32x4.floor - floor of 4 32-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "frintm v16.4s, v16.4s"
elsif X86_64
emit "vroundps $0x1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_trunc, macro()
# f32x4.trunc - truncate 4 32-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "frintz v16.4s, v16.4s"
elsif X86_64
emit "vroundps $0x3, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_nearest, macro()
# f32x4.nearest - round to nearest integer (ties to even) for 4 32-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "frintn v16.4s, v16.4s"
elsif X86_64
emit "vroundps $0x0, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x6B - 0xFD 0x73: i8x16 binary operations
ipintOp(_simd_i8x16_shl, macro()
# i8x16.shl - left shift 16 8-bit integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-7 range for 8-bit elements
andi 7, t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.16b, w0"
# Perform left shift
emit "ushl v16.16b, v16.16b, v17.16b"
elsif X86_64
andi 7, t0
emit "movd %eax, %xmm1"
# See MacroAssemblerX86_64::vectorUshl8()
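# Example: 0x81 << 3: zero-extend to word 0x0081, word-shift to 0x0408,
# clear the high byte (<< 8 then >> 8) to get 0x0008, repack to byte 0x08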
# Unpack and zero-extend low input bytes to words
emit "vxorps %xmm3, %xmm3, %xmm3"
emit "vpunpcklbw %xmm3, %xmm0, %xmm2"
# Word-wise shift low input bytes
emit "vpsllw %xmm1, %xmm2, %xmm2"
# Unpack and zero-extend high input bytes to words
emit "vpunpckhbw %xmm3, %xmm0, %xmm3"
# Word-wise shift high input bytes
emit "vpsllw %xmm1, %xmm3, %xmm3"
# Mask away higher bits of left-shifted results
emit "vpsllw $8, %xmm2, %xmm2"
emit "vpsllw $8, %xmm3, %xmm3"
emit "vpsrlw $8, %xmm2, %xmm2"
emit "vpsrlw $8, %xmm3, %xmm3"
# Pack low and high results back to bytes
emit "vpackuswb %xmm3, %xmm2, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_shr_s, macro()
# i8x16.shr_s - arithmetic right shift 16 8-bit signed integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-7 range for 8-bit elements
andi 7, t0
# Negate for right shift
negi t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.16b, w0"
# Perform arithmetic right shift
emit "sshl v16.16b, v16.16b, v17.16b"
elsif X86_64
andi 7, t0
emit "movd %eax, %xmm1"
# See MacroAssemblerX86_64::vectorSshr8()
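# Example: 0x81 (-127) >> 3 arithmetic: sign-extend to 0xFF81, word-shift to 0xFFF0 (-16),
# signed repack yields byte 0xF0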
# Unpack and sign-extend low input bytes to words
emit "vpmovsxbw %xmm0, %xmm2"
# Word-wise shift low input bytes
emit "vpsraw %xmm1, %xmm2, %xmm2"
# Unpack and sign-extend high input bytes
emit "vpshufd $0x0e, %xmm0, %xmm3" # Move high 8 bytes to low position
emit "vpmovsxbw %xmm3, %xmm3"
# Word-wise shift high input bytes
emit "vpsraw %xmm1, %xmm3, %xmm3"
# Pack low and high results back to signed bytes
emit "vpacksswb %xmm3, %xmm2, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_shr_u, macro()
# i8x16.shr_u - logical right shift 16 8-bit unsigned integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-7 range for 8-bit elements
andi 7, t0
# Negate for right shift
negi t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.16b, w0"
# Perform logical right shift
emit "ushl v16.16b, v16.16b, v17.16b"
elsif X86_64
andi 7, t0
emit "movd %eax, %xmm1"
# See MacroAssemblerX86_64::vectorUshr8()
# Unpack and zero-extend low input bytes to words
emit "vxorps %xmm3, %xmm3, %xmm3"
emit "vpunpcklbw %xmm3, %xmm0, %xmm2"
# Word-wise shift low input bytes
emit "vpsrlw %xmm1, %xmm2, %xmm2"
# Unpack and zero-extend high input bytes to words
emit "vpunpckhbw %xmm3, %xmm0, %xmm3"
# Word-wise shift high input bytes
emit "vpsrlw %xmm1, %xmm3, %xmm3"
# Pack low and high results back to unsigned bytes
emit "vpackuswb %xmm3, %xmm2, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_add, macro()
# i8x16.add - add 16 8-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "add v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpaddb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_add_sat_s, macro()
# i8x16.add_sat_s - add 16 8-bit signed integers with saturation
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "sqadd v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpaddsb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_add_sat_u, macro()
# i8x16.add_sat_u - add 16 8-bit unsigned integers with saturation
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "uqadd v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpaddusb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_sub, macro()
# i8x16.sub - subtract 16 8-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "sub v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpsubb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_sub_sat_s, macro()
# i8x16.sub_sat_s - subtract 16 8-bit signed integers with saturation
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "sqsub v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpsubsb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_sub_sat_u, macro()
# i8x16.sub_sat_u - subtract 16 8-bit unsigned integers with saturation
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "uqsub v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpsubusb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x74 - 0xFD 0x75: f64x2 operations
ipintOp(_simd_f64x2_ceil, macro()
# f64x2.ceil - ceiling of 2 64-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "frintp v16.2d, v16.2d"
elsif X86_64
emit "vroundpd $0x2, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_floor, macro()
# f64x2.floor - floor of 2 64-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "frintm v16.2d, v16.2d"
elsif X86_64
emit "vroundpd $0x1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x76 - 0xFD 0x79: i8x16 binary operations
ipintOp(_simd_i8x16_min_s, macro()
# i8x16.min_s - minimum of 16 8-bit signed integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smin v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpminsb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_min_u, macro()
# i8x16.min_u - minimum of 16 8-bit unsigned integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umin v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpminub %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_max_s, macro()
# i8x16.max_s - maximum of 16 8-bit signed integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smax v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpmaxsb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i8x16_max_u, macro()
# i8x16.max_u - maximum of 16 8-bit unsigned integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umax v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpmaxub %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x7A: f64x2 trunc
ipintOp(_simd_f64x2_trunc, macro()
# f64x2.trunc - truncate 2 64-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "frintz v16.2d, v16.2d"
elsif X86_64
emit "vroundpd $0x3, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x7B: i8x16 avgr_u
ipintOp(_simd_i8x16_avgr_u, macro()
# i8x16.avgr_u - average of 16 8-bit unsigned integers with rounding
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "urhadd v16.16b, v16.16b, v17.16b"
elsif X86_64
emit "vpavgb %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x7C - 0xFD 0x7F: extadd_pairwise
ipintOp(_simd_i16x8_extadd_pairwise_i8x16_s, macro()
# i16x8.extadd_pairwise_i8x16_s - pairwise addition of signed 8-bit integers to 16-bit
popVec(v0)
if ARM64 or ARM64E
emit "saddlp v16.8h, v16.16b"
elsif X86_64
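# vpmaddubsw multiplies corresponding unsigned and signed bytes and adds adjacent products
# into signed 16-bit lanes; with 0x01 in every unsigned byte, each lane becomes the signed
# sum of two adjacent input bytes.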
emit "vpcmpeqd %xmm1, %xmm1, %xmm1" # Set all bits to 1
emit "vpsrlw $15, %xmm1, %xmm1" # Shift to get 0x0001 in each 16-bit lane
emit "vpackuswb %xmm1, %xmm1, %xmm1" # Pack to get 0x01 in each 8-bit lane
emit "vpmaddubsw %xmm0, %xmm1, %xmm0" # Pairwise multiply-add (signed)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extadd_pairwise_i8x16_u, macro()
# i16x8.extadd_pairwise_i8x16_u - pairwise addition of unsigned 8-bit integers to 16-bit
popVec(v0)
if ARM64 or ARM64E
emit "uaddlp v16.8h, v16.16b"
elsif X86_64
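# Same trick as the signed case, but with the operands swapped so the input bytes are the
# unsigned operand and the 0x01 constant is the signed one, giving an unsigned pairwise sum.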
emit "vpcmpeqd %xmm1, %xmm1, %xmm1" # Set all bits to 1
emit "vpsrlw $15, %xmm1, %xmm1" # Shift to get 0x0001 in each 16-bit lane
emit "vpackuswb %xmm1, %xmm1, %xmm1" # Pack to get 0x01 in each 8-bit lane
emit "vpmaddubsw %xmm1, %xmm0, %xmm0" # Pairwise multiply-add (unsigned)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_extadd_pairwise_i16x8_s, macro()
# i32x4.extadd_pairwise_i16x8_s - pairwise addition of signed 16-bit integers to 32-bit
popVec(v0)
if ARM64 or ARM64E
emit "saddlp v16.4s, v16.8h"
elsif X86_64
emit "vpcmpeqd %xmm1, %xmm1, %xmm1" # Set all bits to 1
emit "vpsrld $31, %xmm1, %xmm1" # Shift to get 0x00000001 in each 32-bit lane
emit "vpackssdw %xmm1, %xmm1, %xmm1" # Pack to get 0x0001 in each 16-bit lane
emit "vpmaddwd %xmm0, %xmm1, %xmm0" # Pairwise multiply-add
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_extadd_pairwise_i16x8_u, macro()
# i32x4.extadd_pairwise_i16x8_u - pairwise addition of unsigned 16-bit integers to 32-bit
popVec(v0)
if ARM64 or ARM64E
emit "uaddlp v16.4s, v16.8h"
elsif X86_64
emit "vpsrld $16, %xmm0, %xmm1" # Shift right to get high 16-bits in low position
emit "vpblendw $0xAA, %xmm1, %xmm0, %xmm0" # Blend: keep low 16-bits from src, high 16-bits from shifted
emit "vpaddd %xmm1, %xmm0, %xmm0" # Add the pairs
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x80 0x01 - 0xFD 0x93 0x01: i16x8 operations
ipintOp(_simd_i16x8_abs, macro()
# i16x8.abs - absolute value of 8 16-bit signed integers
popVec(v0)
if ARM64 or ARM64E
emit "abs v16.8h, v16.8h"
elsif X86_64
emit "vpabsw %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_neg, macro()
# i16x8.neg - negate 8 16-bit integers
popVec(v0)
if ARM64 or ARM64E
emit "neg v16.8h, v16.8h"
elsif X86_64
# Negate by subtracting from zero
emit "vpxor %xmm1, %xmm1, %xmm1"
emit "vpsubw %xmm0, %xmm1, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_q15mulr_sat_s, macro()
# i16x8.q15mulr_sat_s - Q15 multiply with rounding and saturation
# Q15 format: multiply two 16-bit values, shift right by 15, round and saturate
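# Example: 0x4000 * 0x4000 (0.5 * 0.5) -> ((0x4000 * 0x4000 >> 14) + 1) >> 1 = 0x2000 (0.25).
# The only overflow case is -1.0 * -1.0 (0x8000 * 0x8000): sqrdmulh saturates it to 0x7fff
# itself, while the x86 path fixes up vpmulhrsw's 0x8000 result below.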
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "sqrdmulh v16.8h, v16.8h, v17.8h"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulSat
emit "vpmulhrsw %xmm1, %xmm0, %xmm0" # Q15 multiply with rounding
emit "mov $0x8000, %eax" # Load -32768 (0x8000)
emit "vmovd %eax, %xmm2" # Move to XMM register
emit "vpshuflw $0x00, %xmm2, %xmm2" # Splat to low 4 words
emit "vpshufd $0x00, %xmm2, %xmm2" # Splat to all 8 words
emit "vpcmpeqw %xmm2, %xmm0, %xmm2" # Compare result with -32768
emit "vpxor %xmm2, %xmm0, %xmm0" # Fix saturation: -32768 becomes 32767
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_all_true, macro()
# i16x8.all_true - return 1 if all 8 16-bit lanes are non-zero, 0 otherwise
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v17.8h, v16.8h, #0" # Compare each lane with 0
emit "umaxv h17, v17.8h" # Find maximum (any zero lane will make this non-zero)
emit "fmov w0, s17" # Move to general register
emit "cmp w0, #0" # Compare with 0
emit "cset w0, eq" # Set to 1 if equal (all lanes non-zero), 0 otherwise
elsif X86_64
# Compare each 16-bit lane with zero
emit "vpxor %xmm1, %xmm1, %xmm1" # Create zero vector
emit "vpcmpeqw %xmm1, %xmm0, %xmm1" # Compare each word with 0 (1 if zero, 0 if non-zero)
# Test if any lane is zero
emit "vpmovmskb %xmm1, %eax" # Extract sign bits
emit "testl %eax, %eax" # Test if any bits are set
emit "sete %al" # Set AL to 1 if no bits set (all lanes non-zero), 0 otherwise
emit "movzbl %al, %eax" # Zero-extend to 32-bit
else
break # Not implemented
end
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_bitmask, macro()
# i16x8.bitmask - extract most significant bit from each 16-bit lane into an 8-bit integer
# Simple loop over the 8 16-bit values on the stack
move 0, t0 # Initialize result
move 0, t3 # Lane counter
.bitmask_i16x8_loop:
# Load 16-bit value and check sign bit
loadh [sp, t3, 2], t1 # Load 16-bit value at offset t3*2
andq 0x8000, t1 # Extract sign bit (bit 15)
btiz t1, .bitmask_i16x8_next
# Set corresponding bit in result
move 1, t1
lshiftq t3, t1 # Shift to bit position
orq t1, t0
.bitmask_i16x8_next:
addq 1, t3 # Next lane
bilt t3, 8, .bitmask_i16x8_loop
addp V128ISize, sp # Pop the vector
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_narrow_i32x4_s, macro()
# i16x8.narrow_i32x4_s - narrow 2 i32x4 vectors to 1 i16x8 vector with signed saturation
popVec(v1) # Second operand
popVec(v0) # First operand
if ARM64 or ARM64E
# Signed saturating extract narrow: combine v0.4s and v1.4s into v16.8h
emit "sqxtn v16.4h, v16.4s" # Narrow first vector (v0) to lower 4 halfwords
emit "sqxtn2 v16.8h, v17.4s" # Narrow second vector (v1) to upper 4 halfwords
elsif X86_64
emit "vpackssdw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_narrow_i32x4_u, macro()
# i16x8.narrow_i32x4_u - narrow 2 i32x4 vectors to 1 i16x8 vector with unsigned saturation
popVec(v1) # Second operand
popVec(v0) # First operand
if ARM64 or ARM64E
# Signed saturating extract unsigned narrow: combine v0.4s and v1.4s into v16.8h
emit "sqxtun v16.4h, v16.4s" # Narrow first vector (v0) to lower 4 halfwords
emit "sqxtun2 v16.8h, v17.4s" # Narrow second vector (v1) to upper 4 halfwords
elsif X86_64
emit "vpackusdw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extend_low_i8x16_s, macro()
# i16x8.extend_low_i8x16_s - sign-extend lower 8 i8 values to i16
popVec(v0)
if ARM64 or ARM64E
emit "sxtl v16.8h, v16.8b"
elsif X86_64
emit "vpmovsxbw %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extend_high_i8x16_s, macro()
# i16x8.extend_high_i8x16_s - sign-extend upper 8 i8 values to i16
popVec(v0)
if ARM64 or ARM64E
emit "sxtl2 v16.8h, v16.16b"
elsif X86_64
# Move high 64 bits to low, then sign extend
emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half
emit "vpmovsxbw %xmm0, %xmm0" # Sign extend
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extend_low_i8x16_u, macro()
# i16x8.extend_low_i8x16_u - zero-extend lower 8 i8 values to i16
popVec(v0)
if ARM64 or ARM64E
emit "uxtl v16.8h, v16.8b"
elsif X86_64
emit "vpmovzxbw %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extend_high_i8x16_u, macro()
# i16x8.extend_high_i8x16_u - zero-extend upper 8 i8 values to i16
popVec(v0)
if ARM64 or ARM64E
emit "uxtl2 v16.8h, v16.16b"
elsif X86_64
# Move high 64 bits to low, then zero extend
emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half
emit "vpmovzxbw %xmm0, %xmm0" # Zero extend
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_shl, macro()
# i16x8.shl - left shift 8 16-bit integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-15 range for 16-bit elements
andi 15, t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.8h, w0"
# Perform left shift
emit "ushl v16.8h, v16.8h, v17.8h"
elsif X86_64
# Mask shift count to 0-15 range for 16-bit elements
andi 15, t0
emit "movd %eax, %xmm1"
# Perform left shift on 16-bit words
emit "vpsllw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_shr_s, macro()
# i16x8.shr_s - arithmetic right shift 8 16-bit signed integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-15 range for 16-bit elements
andi 15, t0
# Negate for right shift
negi t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.8h, w0"
# Perform arithmetic right shift
emit "sshl v16.8h, v16.8h, v17.8h"
elsif X86_64
# Mask shift count to 0-15 range for 16-bit elements
andi 15, t0
emit "movd %eax, %xmm1"
# Perform arithmetic right shift on 16-bit words
emit "vpsraw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_shr_u, macro()
# i16x8.shr_u - logical right shift 8 16-bit unsigned integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-15 range for 16-bit elements
andi 15, t0
# Negate for right shift
negi t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.8h, w0"
# Perform logical right shift
emit "ushl v16.8h, v16.8h, v17.8h"
elsif X86_64
andi 15, t0
emit "movd %eax, %xmm1"
emit "vpsrlw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_add, macro()
# i16x8.add - add 8 16-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "add v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpaddw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_add_sat_s, macro()
# i16x8.add_sat_s - add 8 16-bit signed integers with saturation
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "sqadd v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpaddsw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_add_sat_u, macro()
# i16x8.add_sat_u - add 8 16-bit unsigned integers with saturation
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "uqadd v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpaddusw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_sub, macro()
# i16x8.sub - subtract 8 16-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "sub v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpsubw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_sub_sat_s, macro()
# i16x8.sub_sat_s - subtract 8 16-bit signed integers with saturation
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "sqsub v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpsubsw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_sub_sat_u, macro()
# i16x8.sub_sat_u - subtract 8 16-bit unsigned integers with saturation
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "uqsub v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpsubusw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x94 0x01: f64x2.nearest
ipintOp(_simd_f64x2_nearest, macro()
# f64x2.nearest - round to nearest integer (ties to even) for 2 64-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "frintn v16.2d, v16.2d"
elsif X86_64
emit "vroundpd $0x0, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0x95 0x01 - 0xFD 0x9F 0x01: i16x8 operations
ipintOp(_simd_i16x8_mul, macro()
# i16x8.mul - multiply 8 16-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "mul v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpmullw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_min_s, macro()
# i16x8.min_s - minimum of 8 16-bit signed integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smin v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpminsw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_min_u, macro()
# i16x8.min_u - minimum of 8 16-bit unsigned integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umin v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpminuw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_max_s, macro()
# i16x8.max_s - maximum of 8 16-bit signed integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smax v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpmaxsw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_max_u, macro()
# i16x8.max_u - maximum of 8 16-bit unsigned integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umax v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpmaxuw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfd9a01)
ipintOp(_simd_i16x8_avgr_u, macro()
# i16x8.avgr_u - average of 8 16-bit unsigned integers with rounding
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "urhadd v16.8h, v16.8h, v17.8h"
elsif X86_64
emit "vpavgw %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extmul_low_i8x16_s, macro()
# i16x8.extmul_low_i8x16_s - multiply lower 8 i8 elements and extend to i16
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smull v16.8h, v16.8b, v17.8b"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulLow
emit "vpmovsxbw %xmm0, %xmm2" # Sign extend left to scratch
emit "vpmovsxbw %xmm1, %xmm0" # Sign extend right to dest
emit "vpmullw %xmm2, %xmm0, %xmm0" # Multiply
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extmul_high_i8x16_s, macro()
# i16x8.extmul_high_i8x16_s - multiply upper 8 i8 elements and extend to i16
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smull2 v16.8h, v16.16b, v17.16b"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulHigh
emit "vpunpckhbw %xmm0, %xmm0, %xmm2" # Unpack high bytes of left
emit "vpsraw $8, %xmm2, %xmm2" # Arithmetic shift to sign extend
emit "vpunpckhbw %xmm1, %xmm1, %xmm0" # Unpack high bytes of right
emit "vpsraw $8, %xmm0, %xmm0" # Arithmetic shift to sign extend
emit "vpmullw %xmm2, %xmm0, %xmm0" # Multiply
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extmul_low_i8x16_u, macro()
# i16x8.extmul_low_i8x16_u - multiply lower 8 u8 elements and extend to i16
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umull v16.8h, v16.8b, v17.8b"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulLow
emit "vpmovzxbw %xmm0, %xmm2" # Zero extend left to scratch
emit "vpmovzxbw %xmm1, %xmm0" # Zero extend right to dest
emit "vpmullw %xmm2, %xmm0, %xmm0" # Multiply
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i16x8_extmul_high_i8x16_u, macro()
# i16x8.extmul_high_i8x16_u - multiply upper 8 u8 elements and extend to i16
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umull2 v16.8h, v16.16b, v17.16b"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulHigh
emit "vpxor %xmm2, %xmm2, %xmm2" # Zero scratch register
emit "vpunpckhbw %xmm2, %xmm1, %xmm1" # Unpack high bytes of right with zeros
emit "vpunpckhbw %xmm2, %xmm0, %xmm0" # Unpack high bytes of left with zeros
emit "vpmullw %xmm1, %xmm0, %xmm0" # Multiply
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0xA0 0x01 - 0xFD 0xBF 0x01: i32x4 operations
ipintOp(_simd_i32x4_abs, macro()
# i32x4.abs - absolute value of 4 32-bit signed integers
popVec(v0)
if ARM64 or ARM64E
emit "abs v16.4s, v16.4s"
elsif X86_64
emit "vpabsd %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_neg, macro()
# i32x4.neg - negate 4 32-bit integers
popVec(v0)
if ARM64 or ARM64E
emit "neg v16.4s, v16.4s"
elsif X86_64
# Negate by subtracting from zero
emit "vpxor %xmm1, %xmm1, %xmm1"
emit "vpsubd %xmm0, %xmm1, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfda201)
ipintOp(_simd_i32x4_all_true, macro()
# i32x4.all_true - return 1 if all 4 32-bit lanes are non-zero, 0 otherwise
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v17.4s, v16.4s, #0" # Compare each lane with 0
emit "umaxv s17, v17.4s" # Find maximum (any zero lane will make this non-zero)
emit "fmov w0, s17" # Move to general register
emit "cmp w0, #0" # Compare with 0
emit "cset w0, eq" # Set to 1 if equal (all lanes non-zero), 0 otherwise
elsif X86_64
# Compare each 32-bit lane with zero
emit "vpxor %xmm1, %xmm1, %xmm1" # Create zero vector
emit "vpcmpeqd %xmm1, %xmm0, %xmm1" # Compare each dword with 0 (1 if zero, 0 if non-zero)
# Test if any lane is zero
emit "vpmovmskb %xmm1, %eax" # Extract sign bits
emit "testl %eax, %eax" # Test if any bits are set
emit "sete %al" # Set AL to 1 if no bits set (all lanes non-zero), 0 otherwise
emit "movzbl %al, %eax" # Zero-extend to 32-bit
else
break # Not implemented
end
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_bitmask, macro()
# i32x4.bitmask - extract most significant bit from each 32-bit lane into a 4-bit integer
# Simple loop over the 4 32-bit values on the stack
move 0, t0 # Initialize result
move 0, t3 # Lane counter
.bitmask_i32x4_loop:
# Load 32-bit value and check sign bit
loadi [sp, t3, 4], t1 # Load 32-bit value at offset t3*4
andq 0x80000000, t1 # Extract sign bit (bit 31)
btiz t1, .bitmask_i32x4_next
# Set corresponding bit in result
move 1, t1
lshiftq t3, t1 # Shift to bit position
orq t1, t0
.bitmask_i32x4_next:
addq 1, t3 # Next lane
bilt t3, 4, .bitmask_i32x4_loop
addp V128ISize, sp # Pop the vector
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfda501)
reservedOpcode(0xfda601)
ipintOp(_simd_i32x4_extend_low_i16x8_s, macro()
# i32x4.extend_low_i16x8_s - sign-extend lower 4 i16 values to i32
popVec(v0)
if ARM64 or ARM64E
emit "sxtl v16.4s, v16.4h"
elsif X86_64
emit "vpmovsxwd %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_extend_high_i16x8_s, macro()
# i32x4.extend_high_i16x8_s - sign-extend upper 4 i16 values to i32
popVec(v0)
if ARM64 or ARM64E
emit "sxtl2 v16.4s, v16.8h"
elsif X86_64
# Move high 64 bits to low, then sign extend
emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half
emit "vpmovsxwd %xmm0, %xmm0" # Sign extend
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_extend_low_i16x8_u, macro()
# i32x4.extend_low_i16x8_u - zero-extend lower 4 i16 values to i32
popVec(v0)
if ARM64 or ARM64E
emit "uxtl v16.4s, v16.4h"
elsif X86_64
emit "vpmovzxwd %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_extend_high_i16x8_u, macro()
# i32x4.extend_high_i16x8_u - zero-extend upper 4 i16 values to i32
popVec(v0)
if ARM64 or ARM64E
emit "uxtl2 v16.4s, v16.8h"
elsif X86_64
# Move high 64 bits to low, then zero extend
emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half
emit "vpmovzxwd %xmm0, %xmm0" # Zero extend
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_shl, macro()
# i32x4.shl - left shift 4 32-bit integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-31 range for 32-bit elements
andi 31, t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.4s, w0"
# Perform left shift
emit "ushl v16.4s, v16.4s, v17.4s"
elsif X86_64
andi 31, t0
emit "vmovd %eax, %xmm1"
emit "vpslld %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_shr_s, macro()
# i32x4.shr_s - arithmetic right shift 4 32-bit signed integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-31 range for 32-bit elements
andi 31, t0
# Negate for right shift
negi t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.4s, w0"
# Perform arithmetic right shift
emit "sshl v16.4s, v16.4s, v17.4s"
elsif X86_64
andi 31, t0
emit "vmovd %eax, %xmm1"
emit "vpsrad %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_shr_u, macro()
# i32x4.shr_u - logical right shift 4 32-bit unsigned integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-31 range for 32-bit elements
andi 31, t0
# Negate for right shift
negi t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.4s, w0"
# Perform logical right shift
emit "ushl v16.4s, v16.4s, v17.4s"
elsif X86_64
andi 31, t0
emit "vmovd %eax, %xmm1"
emit "vpsrld %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_add, macro()
# i32x4.add - add 4 32-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "add v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vpaddd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfdaf01)
reservedOpcode(0xfdb001)
ipintOp(_simd_i32x4_sub, macro()
# i32x4.sub - subtract 4 32-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "sub v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vpsubd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfdb201)
reservedOpcode(0xfdb301)
reservedOpcode(0xfdb401)
ipintOp(_simd_i32x4_mul, macro()
# i32x4.mul - multiply 4 32-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "mul v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vpmulld %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_min_s, macro()
# i32x4.min_s - minimum of 4 32-bit signed integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smin v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vpminsd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_min_u, macro()
# i32x4.min_u - minimum of 4 32-bit unsigned integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umin v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vpminud %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_max_s, macro()
# i32x4.max_s - maximum of 4 32-bit signed integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smax v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vpmaxsd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_max_u, macro()
# i32x4.max_u - maximum of 4 32-bit unsigned integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umax v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vpmaxud %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_dot_i16x8_s, macro()
# i32x4.dot_i16x8_s - dot product of signed 16-bit integers to 32-bit
# Multiplies pairs of adjacent 16-bit elements and adds the results
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# Use signed multiply long to multiply adjacent pairs, then pairwise add
emit "smull v18.4s, v16.4h, v17.4h" # multiply low 4 pairs to v18
emit "smull2 v16.4s, v16.8h, v17.8h" # multiply high 4 pairs to v19
# Now pairwise add adjacent elements within each vector to get dot products
emit "addp v16.4s, v18.4s, v16.4s" # pairwise add to get final dot product result
elsif X86_64
emit "vpmaddwd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfdbb01)
ipintOp(_simd_i32x4_extmul_low_i16x8_s, macro()
# i32x4.extmul_low_i16x8_s - multiply lower 4 i16 elements and extend to i32
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smull v16.4s, v16.4h, v17.4h"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulLow
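# A full 16x16 -> 32-bit product is assembled from its halves: vpmullw gives bits 0-15,
# vpmulhw gives bits 16-31, and interleaving the low words reassembles 32-bit products
# for the lower four lanes.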
emit "vpmullw %xmm1, %xmm0, %xmm2" # Low multiply to scratch
emit "vpmulhw %xmm1, %xmm0, %xmm0" # High multiply (signed) to dest
emit "vpunpcklwd %xmm0, %xmm2, %xmm0" # Interleave low words
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_extmul_high_i16x8_s, macro()
# i32x4.extmul_high_i16x8_s - multiply upper 4 i16 elements and extend to i32
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smull2 v16.4s, v16.8h, v17.8h"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulHigh
emit "vpmullw %xmm1, %xmm0, %xmm2" # Low multiply to scratch
emit "vpmulhw %xmm1, %xmm0, %xmm0" # High multiply (signed) to dest
emit "vpunpckhwd %xmm0, %xmm2, %xmm0" # Interleave high words
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_extmul_low_i16x8_u, macro()
# i32x4.extmul_low_i16x8_u - multiply lower 4 u16 elements and extend to i32
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umull v16.4s, v16.4h, v17.4h"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulLow
emit "vpmullw %xmm1, %xmm0, %xmm2" # Low multiply to scratch
emit "vpmulhuw %xmm1, %xmm0, %xmm0" # High multiply (unsigned) to dest
emit "vpunpcklwd %xmm0, %xmm2, %xmm0" # Interleave low words
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_extmul_high_i16x8_u, macro()
# i32x4.extmul_high_i16x8_u - multiply upper 4 u16 elements and extend to i32
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umull2 v16.4s, v16.8h, v17.8h"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulHigh
emit "vpmullw %xmm1, %xmm0, %xmm2" # Low multiply to scratch
emit "vpmulhuw %xmm1, %xmm0, %xmm0" # High multiply (unsigned) to dest
emit "vpunpckhwd %xmm0, %xmm2, %xmm0" # Interleave high words
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0xC0 0x01 - 0xFD 0xDF 0x01: i64x2 operations
ipintOp(_simd_i64x2_abs, macro()
# i64x2.abs - absolute value of 2 64-bit signed integers
popVec(v0)
if ARM64 or ARM64E
emit "abs v16.2d, v16.2d"
elsif X86_64
# No direct vpabsq instruction, implement manually
# For each 64-bit lane: result = (x < 0) ? -x : x
emit "vpxor %xmm1, %xmm1, %xmm1" # xmm1 = 0
emit "vpcmpgtq %xmm0, %xmm1, %xmm2" # xmm2 = mask where x < 0 (0 > x)
emit "vpsubq %xmm0, %xmm1, %xmm1" # xmm1 = -x
emit "vpblendvb %xmm2, %xmm1, %xmm0, %xmm0" # blend: use -x where mask is true, x otherwise
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_neg, macro()
# i64x2.neg - negate 2 64-bit integers
popVec(v0)
if ARM64 or ARM64E
emit "neg v16.2d, v16.2d"
elsif X86_64
# Negate by subtracting from zero
emit "vpxor %xmm1, %xmm1, %xmm1"
emit "vpsubq %xmm0, %xmm1, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfdc201)
ipintOp(_simd_i64x2_all_true, macro()
# i64x2.all_true - return 1 if both 64-bit lanes are non-zero, 0 otherwise
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v17.2d, v16.2d, #0" # Compare each lane with 0
emit "addp d17, v17.2d" # Add pair - if any lane was 0, result will be non-zero
emit "fmov x0, d17" # Move to general register
emit "cmp x0, #0" # Compare with 0
emit "cset w0, eq" # Set to 1 if equal (all lanes non-zero), 0 otherwise
elsif X86_64
# Compare each 64-bit lane with zero
emit "vpxor %xmm1, %xmm1, %xmm1" # Create zero vector
emit "vpcmpeqq %xmm1, %xmm0, %xmm1" # Compare each qword with 0 (1 if zero, 0 if non-zero)
# Test if any lane is zero
emit "vpmovmskb %xmm1, %eax" # Extract sign bits
emit "testl %eax, %eax" # Test if any bits are set
emit "sete %al" # Set AL to 1 if no bits set (all lanes non-zero), 0 otherwise
emit "movzbl %al, %eax" # Zero-extend to 32-bit
else
break # Not implemented
end
pushInt32(t0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_bitmask, macro()
# i64x2.bitmask - extract most significant bit from each 64-bit lane into a 2-bit integer
# Handle both 64-bit values directly
# Load both 64-bit values
loadq [sp], t0 # Load lane 0
loadq 8[sp], t1 # Load lane 1
addp V128ISize, sp # Pop the vector
# Initialize result
move 0, t2
# Check lane 0 sign bit (bit 63)
move 0x8000000000000000, t3
andq t3, t0
btqz t0, .bitmask_i64x2_lane1
orq 1, t2 # Set bit 0
.bitmask_i64x2_lane1:
# Check lane 1 sign bit (bit 63)
andq t3, t1
btqz t1, .bitmask_i64x2_done
orq 2, t2 # Set bit 1
.bitmask_i64x2_done:
pushInt32(t2)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfdc501)
reservedOpcode(0xfdc601)
ipintOp(_simd_i64x2_extend_low_i32x4_s, macro()
# i64x2.extend_low_i32x4_s - sign-extend lower 2 i32 values to i64
popVec(v0)
if ARM64 or ARM64E
emit "sxtl v16.2d, v16.2s"
elsif X86_64
emit "vpmovsxdq %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_extend_high_i32x4_s, macro()
# i64x2.extend_high_i32x4_s - sign-extend upper 2 i32 values to i64
popVec(v0)
if ARM64 or ARM64E
emit "sxtl2 v16.2d, v16.4s"
elsif X86_64
# Move high 64 bits to low, then sign extend
emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half
emit "vpmovsxdq %xmm0, %xmm0" # Sign extend
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_extend_low_i32x4_u, macro()
# i64x2.extend_low_i32x4_u - zero-extend lower 2 i32 values to i64
popVec(v0)
if ARM64 or ARM64E
emit "uxtl v16.2d, v16.2s"
elsif X86_64
emit "vpmovzxdq %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_extend_high_i32x4_u, macro()
# i64x2.extend_high_i32x4_u - zero-extend upper 2 i32 values to i64
popVec(v0)
if ARM64 or ARM64E
emit "uxtl2 v16.2d, v16.4s"
elsif X86_64
# Move high 64 bits to low, then zero extend
emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half
emit "vpmovzxdq %xmm0, %xmm0" # Zero extend
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_shl, macro()
# i64x2.shl - left shift 2 64-bit integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-63 range for 64-bit elements
andi 63, t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.2d, x0"
# Perform left shift
emit "ushl v16.2d, v16.2d, v17.2d"
elsif X86_64
andi 63, t0
emit "movd %eax, %xmm1"
emit "vpsllq %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_shr_s, macro()
# i64x2.shr_s - arithmetic right shift 2 64-bit signed integers
popInt32(t0) # shift count
# Mask shift count to 0-63 range for 64-bit elements
andi 63, t0
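# The vector stays on the stack; shift each 64-bit lane in place (rshiftq is an arithmetic shift)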
loadq 8[sp], t1
rshiftq t0, t1
storeq t1, 8[sp]
loadq [sp], t1
rshiftq t0, t1
storeq t1, [sp]
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_shr_u, macro()
# i64x2.shr_u - logical right shift 2 64-bit unsigned integers
popInt32(t0) # shift count
popVec(v0) # vector
if ARM64 or ARM64E
# Mask shift count to 0-63 range for 64-bit elements
andi 63, t0
# Negate for right shift
negq t0
# Duplicate shift count to all lanes of vector register
emit "dup v17.2d, x0"
# Perform logical right shift
emit "ushl v16.2d, v16.2d, v17.2d"
elsif X86_64
andi 63, t0
emit "movd %eax, %xmm1"
emit "vpsrlq %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_add, macro()
# i64x2.add - add 2 64-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "add v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vpaddq %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfdcf01)
reservedOpcode(0xfdd001)
ipintOp(_simd_i64x2_sub, macro()
# i64x2.sub - subtract 2 64-bit integers
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "sub v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vpsubq %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfdd201)
reservedOpcode(0xfdd301)
reservedOpcode(0xfdd401)
ipintOp(_simd_i64x2_mul, macro()
# i64x2.mul - multiply 2 64-bit integers (low 64 bits of result)
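# Both operands are still on the stack: [sp] / 8[sp] hold the top vector's lanes and
# 16[sp] / 24[sp] the lower vector's lanes; the low 64 bits of each lane product are
# computed with scalar multiplies.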
# Extract and multiply lane 0 (first 64-bit element)
loadq [sp], t0 # Load lane 0 of vector1
loadq 16[sp], t1 # Load lane 0 of vector0
mulq t1, t0 # Multiply: t0 = t0 * t1
storeq t0, 16[sp] # Store result back to vector0
# Extract and multiply lane 1 (second 64-bit element)
loadq 8[sp], t0 # Load lane 1 of vector1
loadq 24[sp], t1 # Load lane 1 of vector0
mulq t1, t0 # Multiply: t0 = t0 * t1
storeq t0, 24[sp] # Store result back to vector0
# Pop vector1, result in vector0
addp V128ISize, sp # Remove first vector from stack, leaving result
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_eq, macro()
# i64x2.eq - compare 2 64-bit integers for equality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vpcmpeqq %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_ne, macro()
# i64x2.ne - compare 2 64-bit integers for inequality
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmeq v16.2d, v16.2d, v17.2d"
emit "mvn v16.16b, v16.16b"
elsif X86_64
# Compare for equality, then invert the result
emit "vpcmpeqq %xmm1, %xmm0, %xmm0"
emit "vpcmpeqq %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_lt_s, macro()
# i64x2.lt_s - compare 2 64-bit signed integers for less than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1
emit "cmgt v16.2d, v17.2d, v16.2d"
elsif X86_64
# vpcmpgtq xmm1, xmm0 gives us xmm1 > xmm0, which is equivalent to xmm0 < xmm1
emit "vpcmpgtq %xmm0, %xmm1, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_gt_s, macro()
# i64x2.gt_s - compare 2 64-bit signed integers for greater than
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmgt v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vpcmpgtq %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_le_s, macro()
# i64x2.le_s - compare 2 64-bit signed integers for less than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# cmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1
emit "cmge v16.2d, v17.2d, v16.2d"
elsif X86_64
# xmm0 <= xmm1 iff !(xmm0 > xmm1)
emit "vpcmpgtq %xmm1, %xmm0, %xmm0" # xmm0 > xmm1
emit "vpcmpeqq %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm0 > xmm1)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_ge_s, macro()
# i64x2.ge_s - compare 2 64-bit signed integers for greater than or equal
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "cmge v16.2d, v16.2d, v17.2d"
elsif X86_64
# xmm0 >= xmm1 iff !(xmm0 < xmm1) iff !(xmm1 > xmm0)
emit "vpcmpgtq %xmm0, %xmm1, %xmm0" # xmm1 > xmm0
emit "vpcmpeqq %xmm2, %xmm2, %xmm2" # Set all bits to 1
emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm1 > xmm0)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_extmul_low_i32x4_s, macro()
# i64x2.extmul_low_i32x4_s - multiply lower 2 i32 elements and extend to i64
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smull v16.2d, v16.2s, v17.2s"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulLow
emit "vpunpckldq %xmm0, %xmm0, %xmm2" # Duplicate low dwords of left
emit "vpunpckldq %xmm1, %xmm1, %xmm0" # Duplicate low dwords of right
emit "vpmuldq %xmm2, %xmm0, %xmm0" # Signed multiply
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_extmul_high_i32x4_s, macro()
# i64x2.extmul_high_i32x4_s - multiply upper 2 i32 elements and extend to i64
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "smull2 v16.2d, v16.4s, v17.4s"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulHigh
emit "vpunpckhdq %xmm0, %xmm0, %xmm2" # Duplicate high dwords of left
emit "vpunpckhdq %xmm1, %xmm1, %xmm0" # Duplicate high dwords of right
emit "vpmuldq %xmm2, %xmm0, %xmm0" # Signed multiply
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_extmul_low_i32x4_u, macro()
# i64x2.extmul_low_i32x4_u - multiply lower 2 u32 elements and extend to i64
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umull v16.2d, v16.2s, v17.2s"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulLow
emit "vpunpckldq %xmm0, %xmm0, %xmm2" # Duplicate low dwords of left
emit "vpunpckldq %xmm1, %xmm1, %xmm0" # Duplicate low dwords of right
emit "vpmuludq %xmm2, %xmm0, %xmm0" # Unsigned multiply
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i64x2_extmul_high_i32x4_u, macro()
# i64x2.extmul_high_i32x4_u - multiply upper 2 u32 elements and extend to i64
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "umull2 v16.2d, v16.4s, v17.4s"
elsif X86_64
# See MacroAssemblerX86_64::vectorMulHigh
emit "vpunpckhdq %xmm0, %xmm0, %xmm2" # Duplicate high dwords of left
emit "vpunpckhdq %xmm1, %xmm1, %xmm0" # Duplicate high dwords of right
emit "vpmuludq %xmm2, %xmm0, %xmm0" # Unsigned multiply
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0xE0 0x01 - 0xFD 0xEB 0x01: f32x4 operations
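# Note: x86-64 has no packed float abs/neg instruction, so the f32x4/f64x2 abs and
# neg ops below build a sign-bit mask in %rax, broadcast it across the vector, and
# AND/XOR it with the operand.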
ipintOp(_simd_f32x4_abs, macro()
# f32x4.abs - absolute value of 4 32-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "fabs v16.4s, v16.4s"
elsif X86_64
# Clear sign bit by AND with 0x7FFFFFFF mask
emit "movabsq $0x7fffffff7fffffff, %rax"
emit "vmovq %rax, %xmm1"
emit "vpunpcklqdq %xmm1, %xmm1, %xmm1"
emit "vandps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_neg, macro()
# f32x4.neg - negate 4 32-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "fneg v16.4s, v16.4s"
elsif X86_64
# Flip sign bit by XOR with 0x80000000 mask
emit "movabsq $0x8000000080000000, %rax"
emit "vmovq %rax, %xmm1"
emit "vpunpcklqdq %xmm1, %xmm1, %xmm1"
emit "vxorps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfde201)
ipintOp(_simd_f32x4_sqrt, macro()
# f32x4.sqrt - square root of 4 32-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "fsqrt v16.4s, v16.4s"
elsif X86_64
emit "vsqrtps %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_add, macro()
# f32x4.add - add 4 32-bit floats
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fadd v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vaddps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_sub, macro()
# f32x4.sub - subtract 4 32-bit floats
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fsub v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vsubps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_mul, macro()
# f32x4.mul - multiply 4 32-bit floats
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fmul v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vmulps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_div, macro()
# f32x4.div - divide 4 32-bit floats
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fdiv v16.4s, v16.4s, v17.4s"
elsif X86_64
emit "vdivps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_min, macro()
# f32x4.min - minimum of 4 32-bit floats (IEEE 754-2008 semantics)
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fmin v16.4s, v16.4s, v17.4s"
elsif X86_64
# Wasm min/max differs from the X86_64 packed min/max instructions in how signed
# zeros and NaNs are handled, so those cases need special handling.
# Compute result in both directions to handle NaN asymmetry
emit "vminps %xmm1, %xmm0, %xmm2" # xmm2 = min(xmm0, xmm1)
emit "vminps %xmm0, %xmm1, %xmm0" # xmm0 = min(xmm1, xmm0)
# OR results to propagate sign bits and NaN bits
emit "vorps %xmm0, %xmm2, %xmm2" # xmm2 = xmm0 | xmm2
# Canonicalize NaNs by checking for unordered values and clearing mantissa
emit "vcmpunordps %xmm2, %xmm0, %xmm0" # xmm0 = NaN mask (all 1's where NaN)
emit "vorps %xmm0, %xmm2, %xmm2" # xmm2 |= NaN mask
emit "vpsrld $10, %xmm0, %xmm0" # Shift mask to clear mantissa bits (f32 uses 10)
emit "vpandn %xmm2, %xmm0, %xmm0" # Clear mantissa to canonicalize NaN
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_max, macro()
# f32x4.max - maximum of 4 32-bit floats (IEEE 754-2008 semantics)
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fmax v16.4s, v16.4s, v17.4s"
elsif X86_64
# Wasm min/max differs from the X86_64 packed min/max instructions in how signed
# zeros and NaNs are handled, so those cases need special handling.
# Compute result in both directions to handle NaN asymmetry
emit "vmaxps %xmm1, %xmm0, %xmm2" # xmm2 = max(xmm0, xmm1)
emit "vmaxps %xmm0, %xmm1, %xmm0" # xmm0 = max(xmm1, xmm0)
# Check for discrepancies by XORing the results
emit "vxorps %xmm0, %xmm2, %xmm0" # xmm0 = xmm0 ^ xmm2
# OR results to propagate sign bits and NaN bits
emit "vorps %xmm0, %xmm2, %xmm2" # xmm2 = xmm0 | xmm2
# Propagate discrepancies in sign bit
emit "vsubps %xmm0, %xmm2, %xmm2" # xmm2 = xmm2 - xmm0
# Canonicalize NaNs by checking for unordered values and clearing mantissa
emit "vcmpunordps %xmm2, %xmm0, %xmm0" # xmm0 = NaN mask (all 1's where NaN)
emit "vpsrld $10, %xmm0, %xmm0" # Shift mask to clear mantissa bits (f32 uses 10)
emit "vpandn %xmm2, %xmm0, %xmm0" # Clear mantissa to canonicalize NaN
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
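# pmin/pmax use Wasm's "pseudo" min/max semantics: a plain compare-and-select with
# no special handling of NaNs or signed zeros, so one compare plus a blend/bsl is enough.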
ipintOp(_simd_f32x4_pmin, macro()
# f32x4.pmin - pseudo-minimum of 4 32-bit floats (b < a ? b : a)
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# Use fcmgt to compare v0 > v1, then use bsl to select
emit "fcmgt v18.4s, v16.4s, v17.4s"
emit "bsl v18.16b, v17.16b, v16.16b"
emit "mov v16.16b, v18.16b"
elsif X86_64
emit "vcmpgtps %xmm1, %xmm0, %xmm2" # xmm2 = (a > b) ? 0xFFFFFFFF : 0x00000000
emit "vblendvps %xmm2, %xmm1, %xmm0, %xmm0" # select b if mask is true, a if false
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_pmax, macro()
# f32x4.pmax - pseudo-maximum of 4 32-bit floats (a < b ? b : a)
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# Use fcmgt to compare v1 > v0, then use bsl to select
emit "fcmgt v18.4s, v17.4s, v16.4s"
emit "bsl v18.16b, v17.16b, v16.16b"
emit "mov v16.16b, v18.16b"
elsif X86_64
emit "vcmpgtps %xmm0, %xmm1, %xmm2" # xmm2 = (b > a) ? 0xFFFFFFFF : 0x00000000
emit "vblendvps %xmm2, %xmm1, %xmm0, %xmm0" # select b if mask is true, a if false
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0xEC 0x01 - 0xFD 0xF7 0x01: f64x2 operations
ipintOp(_simd_f64x2_abs, macro()
# f64x2.abs - absolute value of 2 64-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "fabs v16.2d, v16.2d"
elsif X86_64
# Clear sign bit by AND with 0x7FFFFFFFFFFFFFFF mask
emit "movabsq $0x7fffffffffffffff, %rax"
emit "vmovq %rax, %xmm1"
emit "vpunpcklqdq %xmm1, %xmm1, %xmm1"
emit "vandpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_neg, macro()
# f64x2.neg - negate 2 64-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "fneg v16.2d, v16.2d"
elsif X86_64
# Flip sign bit by XOR with 0x8000000000000000 mask
emit "movabsq $0x8000000000000000, %rax"
emit "vmovq %rax, %xmm1"
emit "vpunpcklqdq %xmm1, %xmm1, %xmm1"
emit "vxorpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
reservedOpcode(0xfdee01)
ipintOp(_simd_f64x2_sqrt, macro()
# f64x2.sqrt - square root of 2 64-bit floats
popVec(v0)
if ARM64 or ARM64E
emit "fsqrt v16.2d, v16.2d"
elsif X86_64
emit "vsqrtpd %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_add, macro()
# f64x2.add - add 2 64-bit floats
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fadd v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vaddpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_sub, macro()
# f64x2.sub - subtract 2 64-bit floats
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fsub v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vsubpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_mul, macro()
# f64x2.mul - multiply 2 64-bit floats
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fmul v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vmulpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_div, macro()
# f64x2.div - divide 2 64-bit floats
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fdiv v16.2d, v16.2d, v17.2d"
elsif X86_64
emit "vdivpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_min, macro()
# f64x2.min - minimum of 2 64-bit floats (IEEE 754-2008 semantics)
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fmin v16.2d, v16.2d, v17.2d"
elsif X86_64
# Wasm min/max differs from the X86_64 packed min/max instructions in how signed
# zeros and NaNs are handled, so those cases need special handling.
# Compute result in both directions to handle NaN asymmetry
emit "vminpd %xmm1, %xmm0, %xmm2" # xmm2 = min(xmm0, xmm1)
emit "vminpd %xmm0, %xmm1, %xmm0" # xmm0 = min(xmm1, xmm0)
# OR results to propagate sign bits and NaN bits
emit "vorpd %xmm0, %xmm2, %xmm2" # xmm2 = xmm0 | xmm2
# Canonicalize NaNs by checking for unordered values and clearing mantissa
emit "vcmpunordpd %xmm2, %xmm0, %xmm0" # xmm0 = NaN mask (all 1's where NaN)
emit "vorpd %xmm0, %xmm2, %xmm2" # xmm2 |= NaN mask
emit "vpsrlq $13, %xmm0, %xmm0" # Shift mask to clear mantissa bits
emit "vpandn %xmm2, %xmm0, %xmm0" # Clear mantissa to canonicalize NaN
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_max, macro()
# f64x2.max - maximum of 2 64-bit floats (IEEE 754-2008 semantics)
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
emit "fmax v16.2d, v16.2d, v17.2d"
elsif X86_64
# Wasm min/max differs from the X86_64 packed min/max instructions in how signed
# zeros and NaNs are handled, so those cases need special handling.
# Compute result in both directions to handle NaN asymmetry
emit "vmaxpd %xmm1, %xmm0, %xmm2" # xmm2 = max(xmm0, xmm1)
emit "vmaxpd %xmm0, %xmm1, %xmm0" # xmm0 = max(xmm1, xmm0)
# Check for discrepancies by XORing the results
emit "vxorpd %xmm0, %xmm2, %xmm0" # xmm0 = xmm0 ^ xmm2
# OR results to propagate sign bits and NaN bits
emit "vorpd %xmm0, %xmm2, %xmm2" # xmm2 = xmm0 | xmm2
# Propagate discrepancies in sign bit
emit "vsubpd %xmm0, %xmm2, %xmm2" # xmm2 = xmm2 - xmm0
# Canonicalize NaNs by checking for unordered values and clearing mantissa
emit "vcmpunordpd %xmm2, %xmm0, %xmm0" # xmm0 = NaN mask (all 1's where NaN)
emit "vpsrlq $13, %xmm0, %xmm0" # Shift mask to clear mantissa bits
emit "vpandn %xmm2, %xmm0, %xmm0" # Clear mantissa to canonicalize NaN
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_pmin, macro()
# f64x2.pmin - pseudo-minimum of 2 64-bit floats (b < a ? b : a)
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# Use fcmgt to compare v0 > v1, then use bsl to select
emit "fcmgt v18.2d, v16.2d, v17.2d"
emit "bsl v18.16b, v17.16b, v16.16b"
emit "mov v16.16b, v18.16b"
elsif X86_64
emit "vcmpgtpd %xmm1, %xmm0, %xmm2" # xmm2 = (a > b) ? 0xFFFFFFFF : 0x00000000
emit "vblendvpd %xmm2, %xmm1, %xmm0, %xmm0" # select b if mask is true, a if false
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_pmax, macro()
# f64x2.pmax - pseudo-maximum of 2 64-bit floats (a < b ? b : a)
popVec(v1)
popVec(v0)
if ARM64 or ARM64E
# Use fcmgt to compare v1 > v0, then use bsl to select
emit "fcmgt v18.2d, v17.2d, v16.2d"
emit "bsl v18.16b, v17.16b, v16.16b"
emit "mov v16.16b, v18.16b"
elsif X86_64
emit "vcmpgtpd %xmm0, %xmm1, %xmm2" # xmm2 = (b > a) ? 0xFFFFFFFF : 0x00000000
emit "vblendvpd %xmm2, %xmm1, %xmm0, %xmm0" # select b if mask is true, a if false
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
# 0xFD 0xF8 0x01 - 0xFD 0xFF 0x01: trunc/convert
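# Note on the X86_64 paths below: cvttps2dq / cvttpd2dq produce the "integer
# indefinite" value 0x80000000 for NaN and out-of-range inputs, while Wasm's
# trunc_sat requires NaN lanes to become 0 and overflow to saturate, hence the
# extra masking and fix-up sequences.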
ipintOp(_simd_i32x4_trunc_sat_f32x4_s, macro()
# i32x4.trunc_sat_f32x4_s - truncate 4 f32 values to signed i32 with saturation
popVec(v0)
if ARM64 or ARM64E
emit "fcvtzs v16.4s, v16.4s"
elsif X86_64
# Saturation logic following MacroAssembler implementation
emit "vmovaps %xmm0, %xmm1" # xmm1 = src
emit "vcmpunordps %xmm1, %xmm1, %xmm1" # xmm1 = NaN mask
emit "vandnps %xmm0, %xmm1, %xmm1" # xmm1 = src with NaN lanes cleared
# Load 0x1.0p+31f (2147483648.0f) constant
emit "movl $0x4f000000, %eax" # 0x1.0p+31f
emit "vmovd %eax, %xmm2"
emit "vshufps $0, %xmm2, %xmm2, %xmm2" # Broadcast to all 4 lanes
emit "vcmpnltps %xmm2, %xmm1, %xmm3" # xmm3 = positive overflow mask (src >= 0x80000000)
emit "vcvttps2dq %xmm1, %xmm1" # Convert with overflow saturated to 0x80000000
emit "vpxor %xmm3, %xmm1, %xmm0" # Convert positive overflow to 0x7FFFFFFF
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_trunc_sat_f32x4_u, macro()
# i32x4.trunc_sat_f32x4_u - truncate 4 f32 values to unsigned i32 with saturation
popVec(v0)
if ARM64 or ARM64E
emit "fcvtzu v16.4s, v16.4s"
elsif X86_64
# Unsigned saturation logic following MacroAssembler implementation
emit "vxorps %xmm1, %xmm1, %xmm1" # xmm1 = 0
emit "vmaxps %xmm1, %xmm0, %xmm0" # Clear NaN and negatives
# Load 2147483647.0f constant (rounds to 2147483648.0f in float32)
emit "movl $0x4f000000, %eax" # 2147483647.0f
emit "vmovd %eax, %xmm2"
emit "vshufps $0, %xmm2, %xmm2, %xmm2" # Broadcast to all 4 lanes
emit "vmovaps %xmm0, %xmm3" # xmm3 = src copy
emit "vsubps %xmm2, %xmm3, %xmm3" # xmm3 = src - 2147483647.0f
emit "vcmpnltps %xmm2, %xmm3, %xmm1" # xmm1 = mask for overflow
emit "vcvttps2dq %xmm3, %xmm3" # Convert (src - 2147483647.0f)
emit "vpxor %xmm1, %xmm3, %xmm3" # Saturate positive overflow to 0x7FFFFFFF
emit "vpxor %xmm4, %xmm4, %xmm4" # xmm4 = 0
emit "vpmaxsd %xmm4, %xmm3, %xmm3" # Clear negatives
emit "vcvttps2dq %xmm0, %xmm0" # Convert original src
emit "vpaddd %xmm3, %xmm0, %xmm0" # Add correction
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_convert_i32x4_s, macro()
# f32x4.convert_i32x4_s - convert 4 signed i32 values to f32
popVec(v0)
if ARM64 or ARM64E
emit "scvtf v16.4s, v16.4s"
elsif X86_64
emit "vcvtdq2ps %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f32x4_convert_i32x4_u, macro()
# f32x4.convert_i32x4_u - convert 4 unsigned i32 values to f32
popVec(v0)
if ARM64 or ARM64E
emit "ucvtf v16.4s, v16.4s"
elsif X86_64
# See MacroAssembler::vectorConvertUnsigned
emit "vpxor %xmm1, %xmm1, %xmm1" # clear scratch
emit "vpblendw $0x55, %xmm0, %xmm1, %xmm1" # i_low = low 16 bits of src
emit "vpsubd %xmm1, %xmm0, %xmm0" # i_high = high 16 bits of src
emit "vcvtdq2ps %xmm1, %xmm1" # f_low = convertToF32(i_low)
emit "vpsrld $1, %xmm0, %xmm0" # i_half_high = i_high / 2
emit "vcvtdq2ps %xmm0, %xmm0" # f_half_high = convertToF32(i_half_high)
emit "vaddps %xmm0, %xmm0, %xmm0" # dst = f_half_high + f_half_high + f_low
emit "vaddps %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_trunc_sat_f64x2_s_zero, macro()
# i32x4.trunc_sat_f64x2_s_zero - truncate 2 f64 values to signed i32, zero upper 2 lanes
popVec(v0)
if ARM64 or ARM64E
# Convert f64 to signed i64 first
emit "fcvtzs v16.2d, v16.2d"
# Signed saturating extract narrow from i64 to i32
emit "sqxtn v16.2s, v16.2d"
# Zero the upper 64 bits (lanes 2,3)
emit "mov v16.d[1], xzr"
elsif X86_64
emit "vcmppd $0, %xmm0, %xmm0, %xmm1" # xmm1 = ordered comparison mask (not NaN)
# Load 2147483647.0 constant
emit "movabsq $0x41dfffffffc00000, %rax" # 2147483647.0 as double
emit "vmovq %rax, %xmm2"
emit "vpunpcklqdq %xmm2, %xmm2, %xmm2" # Broadcast to both lanes
emit "vandpd %xmm2, %xmm1, %xmm1" # xmm1 = 2147483647.0 where not NaN, 0 where NaN
emit "vminpd %xmm1, %xmm0, %xmm0" # Clamp to max value and handle NaN
emit "vcvttpd2dq %xmm0, %xmm0" # Convert to i32 (result in lower 64 bits, upper zeroed)
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_i32x4_trunc_sat_f64x2_u_zero, macro()
# i32x4.trunc_sat_f64x2_u_zero - truncate 2 f64 values to unsigned i32, zero upper 2 lanes
popVec(v0)
if ARM64 or ARM64E
# Convert f64 to unsigned i64 first
emit "fcvtzu v16.2d, v16.2d"
# Unsigned saturating extract narrow from i64 to i32
emit "uqxtn v16.2s, v16.2d"
# Zero the upper 64 bits (lanes 2,3)
emit "mov v16.d[1], xzr"
elsif X86_64
# See MacroAssembler::vectorTruncSatUnsignedFloat64
# Load constants: 4294967295.0 and 0x1.0p+52
emit "movabsq $0x41efffffffe00000, %rax" # 4294967295.0 as double
emit "vmovq %rax, %xmm2"
emit "vpunpcklqdq %xmm2, %xmm2, %xmm2" # xmm2 = [4294967295.0, 4294967295.0]
emit "movabsq $0x4330000000000000, %rax" # 0x1.0p+52 as double
emit "vmovq %rax, %xmm3"
emit "vpunpcklqdq %xmm3, %xmm3, %xmm3" # xmm3 = [0x1.0p+52, 0x1.0p+52]
emit "vxorpd %xmm1, %xmm1, %xmm1" # xmm1 = 0.0
emit "vmaxpd %xmm1, %xmm0, %xmm0" # Clear negatives
emit "vminpd %xmm2, %xmm0, %xmm0" # Clamp to 4294967295.0
emit "vroundpd $3, %xmm0, %xmm0" # Truncate toward zero
emit "vaddpd %xmm3, %xmm0, %xmm0" # Add 0x1.0p+52 (magic number conversion)
emit "vshufps $0x88, %xmm1, %xmm0, %xmm0" # Pack to i32 and zero upper
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_convert_low_i32x4_s, macro()
# f64x2.convert_low_i32x4_s - convert lower 2 signed i32 values to f64
popVec(v0)
if ARM64 or ARM64E
# Sign-extend lower 2 i32 values to i64, then convert to f64
emit "sxtl v16.2d, v16.2s"
emit "scvtf v16.2d, v16.2d"
elsif X86_64
emit "vcvtdq2pd %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
ipintOp(_simd_f64x2_convert_low_i32x4_u, macro()
# f64x2.convert_low_i32x4_u - convert lower 2 unsigned i32 values to f64
popVec(v0)
if ARM64 or ARM64E
# Zero-extend lower 2 i32 values to i64, then convert to f64
emit "uxtl v16.2d, v16.2s"
emit "ucvtf v16.2d, v16.2d"
elsif X86_64
# See MacroAssembler::vectorConvertLowUnsignedInt32
# Load 0x43300000 (high32Bits) and splat to all lanes
emit "movl $0x43300000, %eax"
emit "vmovd %eax, %xmm1"
emit "vpshufd $0, %xmm1, %xmm1"
# Unpack lower 2 i32 with high32Bits
emit "vunpcklps %xmm1, %xmm0, %xmm0" # Interleave: [i32_0, 0x43300000, i32_1, 0x43300000]
# Load 0x1.0p+52 mask
emit "movabsq $0x4330000000000000, %rax" # 0x1.0p+52 as double
emit "vmovq %rax, %xmm1"
emit "vpunpcklqdq %xmm1, %xmm1, %xmm1" # xmm1 = [0x1.0p+52, 0x1.0p+52]
# Subtract to get the correct unsigned values
emit "vsubpd %xmm1, %xmm0, %xmm0"
else
break # Not implemented
end
pushVec(v0)
advancePC(2)
nextIPIntInstruction()
end)
#########################
## Atomic instructions ##
#########################
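# Wasm atomic accesses trap on out-of-bounds addresses (OutOfBoundsMemoryAccess) and,
# unlike plain loads/stores, also trap when the effective address is not naturally
# aligned for the access width (UnalignedMemoryAccess). The macro below checks both
# before the memory base is added.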
macro ipintCheckMemoryBoundWithAlignmentCheck(mem, scratch, size)
leap size - 1[mem], scratch
bpb scratch, boundsCheckingSize, .continuationInBounds
.throwOOB:
ipintException(OutOfBoundsMemoryAccess)
.continuationInBounds:
btpz mem, (size - 1), .continuationAligned
.throwUnaligned:
throwException(UnalignedMemoryAccess)
.continuationAligned:
end
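# 1-byte accesses are always naturally aligned, so the 1-byte variant only needs the bounds check.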
macro ipintCheckMemoryBoundWithAlignmentCheck1(mem, scratch)
ipintCheckMemoryBound(mem, scratch, 1)
end
macro ipintCheckMemoryBoundWithAlignmentCheck2(mem, scratch)
ipintCheckMemoryBoundWithAlignmentCheck(mem, scratch, 2)
end
macro ipintCheckMemoryBoundWithAlignmentCheck4(mem, scratch)
ipintCheckMemoryBoundWithAlignmentCheck(mem, scratch, 4)
end
macro ipintCheckMemoryBoundWithAlignmentCheck8(mem, scratch)
ipintCheckMemoryBoundWithAlignmentCheck(mem, scratch, 8)
end
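# memory.atomic.notify / wait32 / wait64 are handled by runtime slow paths through
# operationCall; a negative return value is treated as an out-of-bounds access and
# raises OutOfBoundsMemoryAccess.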
ipintOp(_memory_atomic_notify, macro()
# pop count
popInt32(a3)
# pop pointer
popInt32(a1)
# load offset
loadi IPInt::Const32Metadata::value[MC], a2
operationCall(macro() cCall4(_ipint_extern_memory_atomic_notify) end)
bilt r0, 0, .atomic_notify_throw
pushInt32(r0)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
.atomic_notify_throw:
ipintException(OutOfBoundsMemoryAccess)
end)
ipintOp(_memory_atomic_wait32, macro()
# pop timeout
popInt32(a3)
# pop value
popInt32(a2)
# pop pointer
popInt32(a1)
# load offset
loadi IPInt::Const32Metadata::value[MC], t0
# merge them since the slow path takes the combined pointer + offset.
addq t0, a1
operationCall(macro() cCall4(_ipint_extern_memory_atomic_wait32) end)
bilt r0, 0, .atomic_wait32_throw
pushInt32(r0)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
.atomic_wait32_throw:
ipintException(OutOfBoundsMemoryAccess)
end)
ipintOp(_memory_atomic_wait64, macro()
# pop timeout
popInt32(a3)
# pop value
popInt64(a2)
# pop pointer
popInt32(a1)
# load offset
loadi IPInt::Const32Metadata::value[MC], t0
# merge them since the slow path takes the combined pointer + offset.
addq t0, a1
operationCall(macro() cCall4(_ipint_extern_memory_atomic_wait64) end)
bilt r0, 0, .atomic_wait64_throw
pushInt32(r0)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
.atomic_wait64_throw:
ipintException(OutOfBoundsMemoryAccess)
end)
ipintOp(_atomic_fence, macro()
fence
loadb IPInt::InstructionLengthMetadata::length[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata)))
nextIPIntInstruction()
end)
reservedOpcode(atomic_0x4)
reservedOpcode(atomic_0x5)
reservedOpcode(atomic_0x6)
reservedOpcode(atomic_0x7)
reservedOpcode(atomic_0x8)
reservedOpcode(atomic_0x9)
reservedOpcode(atomic_0xa)
reservedOpcode(atomic_0xb)
reservedOpcode(atomic_0xc)
reservedOpcode(atomic_0xd)
reservedOpcode(atomic_0xe)
reservedOpcode(atomic_0xf)
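# Shared shape for the atomic loads: pop the i32 index, zero-extend it, add the
# constant offset from the metadata, bounds- and alignment-check the effective
# address, rebase it onto memoryBase, then perform the atomic load and push the result.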
macro atomicLoadOp(boundsAndAlignmentCheck, loadAndPush)
# pop index
popInt32(t0)
ori 0, t0 # zero-extend the i32 index to the full register
# load offset
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t0
boundsAndAlignmentCheck(t0, t3)
addq memoryBase, t0
loadAndPush(t0, t2)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end
ipintOp(_i32_atomic_load, macro()
atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, scratch)
if ARM64 or ARM64E or X86_64
atomicloadi [mem], scratch
else
error
end
pushInt32(scratch)
end)
end)
ipintOp(_i64_atomic_load, macro()
atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, scratch)
if ARM64 or ARM64E or X86_64
atomicloadq [mem], scratch
else
error
end
pushInt64(scratch)
end)
end)
ipintOp(_i32_atomic_load8_u, macro()
atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, scratch)
if ARM64 or ARM64E or X86_64
atomicloadb [mem], scratch
else
error
end
pushInt32(scratch)
end)
end)
ipintOp(_i32_atomic_load16_u, macro()
atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, scratch)
if ARM64 or ARM64E or X86_64
atomicloadh [mem], scratch
else
error
end
pushInt32(scratch)
end)
end)
ipintOp(_i64_atomic_load8_u, macro()
atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, scratch)
if ARM64 or ARM64E or X86_64
atomicloadb [mem], scratch
else
error
end
pushInt64(scratch)
end)
end)
ipintOp(_i64_atomic_load16_u, macro()
atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, scratch)
if ARM64 or ARM64E or X86_64
atomicloadh [mem], scratch
else
error
end
pushInt64(scratch)
end)
end)
ipintOp(_i64_atomic_load32_u, macro()
atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, scratch)
if ARM64 or ARM64E or X86_64
atomicloadi [mem], scratch
else
error
end
pushInt64(scratch)
end)
end)
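# The weakCASLoop* helpers emulate an atomic read-modify-write with a retry loop.
# On X86_64, fn(value, dst) updates dst in place and the result is committed with a
# weak compare-and-swap against the previously observed value, retrying on failure.
# On ARM64, fn(value, oldValue, newValue) computes the value to store for a
# load-linked (acquire) / store-conditional (release) pair. Either way,
# scratch1AndOldValue ends up holding the old memory value.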
macro weakCASLoopByte(mem, value, scratch1AndOldValue, scratch2, fn)
validateOpcodeConfig(scratch1AndOldValue)
if X86_64
loadb [mem], scratch1AndOldValue
.loop:
move scratch1AndOldValue, scratch2
fn(value, scratch2)
batomicweakcasb scratch1AndOldValue, scratch2, [mem], .loop
else
.loop:
loadlinkacqb [mem], scratch1AndOldValue
fn(value, scratch1AndOldValue, scratch2)
storecondrelb ws2, scratch2, [mem]
bineq ws2, 0, .loop
end
end
macro weakCASLoopHalf(mem, value, scratch1AndOldValue, scratch2, fn)
validateOpcodeConfig(scratch1AndOldValue)
if X86_64
loadh [mem], scratch1AndOldValue
.loop:
move scratch1AndOldValue, scratch2
fn(value, scratch2)
batomicweakcash scratch1AndOldValue, scratch2, [mem], .loop
else
.loop:
loadlinkacqh [mem], scratch1AndOldValue
fn(value, scratch1AndOldValue, scratch2)
storecondrelh ws2, scratch2, [mem]
bineq ws2, 0, .loop
end
end
macro weakCASLoopInt(mem, value, scratch1AndOldValue, scratch2, fn)
validateOpcodeConfig(scratch1AndOldValue)
if X86_64
loadi [mem], scratch1AndOldValue
.loop:
move scratch1AndOldValue, scratch2
fn(value, scratch2)
batomicweakcasi scratch1AndOldValue, scratch2, [mem], .loop
else
.loop:
loadlinkacqi [mem], scratch1AndOldValue
fn(value, scratch1AndOldValue, scratch2)
storecondreli ws2, scratch2, [mem]
bineq ws2, 0, .loop
end
end
macro weakCASLoopQuad(mem, value, scratch1AndOldValue, scratch2, fn)
validateOpcodeConfig(scratch1AndOldValue)
if X86_64
loadq [mem], scratch1AndOldValue
.loop:
move scratch1AndOldValue, scratch2
fn(value, scratch2)
batomicweakcasq scratch1AndOldValue, scratch2, [mem], .loop
else
.loop:
loadlinkacqq [mem], scratch1AndOldValue
fn(value, scratch1AndOldValue, scratch2)
storecondrelq ws2, scratch2, [mem]
bineq ws2, 0, .loop
end
end
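# Shared shape for the atomic stores: pop the value and the i32 index, apply the
# constant offset, bounds/alignment-check, then store either with an atomic exchange
# whose result is discarded (ARM64E / X86_64) or, on plain ARM64, with a CAS loop
# that simply writes the new value.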
macro atomicStoreOp(boundsAndAlignmentCheck, popAndStore)
# pop value
popInt64(t1)
# pop index
popInt32(t2)
ori 0, t2 # zero-extend the i32 index to the full register
# load offset
loadi IPInt::Const32Metadata::value[MC], t0
addp t0, t2
boundsAndAlignmentCheck(t2, t3)
addq memoryBase, t2
popAndStore(t2, t1, t0, t3)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end
ipintOp(_i32_atomic_store, macro()
atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgi value, [mem], value
elsif X86_64
atomicxchgi value, [mem]
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
end)
end)
ipintOp(_i64_atomic_store, macro()
atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgq value, [mem], value
elsif X86_64
atomicxchgq value, [mem]
elsif ARM64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
end)
end)
ipintOp(_i32_atomic_store8_u, macro()
atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgb value, [mem], value
elsif X86_64
atomicxchgb value, [mem]
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
end)
end)
ipintOp(_i32_atomic_store16_u, macro()
atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgh value, [mem], value
elsif X86_64
atomicxchgh value, [mem]
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
end)
end)
ipintOp(_i64_atomic_store8_u, macro()
atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgb value, [mem], value
elsif X86_64
atomicxchgb value, [mem]
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
end)
end)
ipintOp(_i64_atomic_store16_u, macro()
atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgh value, [mem], value
elsif X86_64
atomicxchgh value, [mem]
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
end)
end)
ipintOp(_i64_atomic_store32_u, macro()
atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgi value, [mem], value
elsif X86_64
atomicxchgi value, [mem]
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
end)
end)
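# Shared shape for the atomic read-modify-write ops: same setup as the stores, but
# the old memory value is pushed back as the result. Note that the *_sub variants
# negate the operand and reuse the atomic add-exchange where one is available.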
macro atomicRMWOp(boundsAndAlignmentCheck, rmw)
# pop value
popInt64(t1)
# pop index
popInt32(t2)
ori 0, t2 # zero-extend the i32 index to the full register
# load offset
loadi IPInt::Const32Metadata::value[MC], t0
addp t0, t2
boundsAndAlignmentCheck(t2, t3)
addq memoryBase, t2
rmw(t2, t1, t0, t3)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end
ipintOp(_i32_atomic_rmw_add, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgaddi value, [mem], scratch1
elsif X86_64
atomicxchgaddi value, [mem]
move value, scratch1
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
addi value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw_add, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgaddq value, [mem], scratch1
elsif X86_64
atomicxchgaddq value, [mem]
move value, scratch1
elsif ARM64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
addq value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw8_add_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgaddb value, [mem], scratch1
elsif X86_64
atomicxchgaddb value, [mem]
move value, scratch1
andi 0xff, scratch1
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
addi value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw16_add_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgaddh value, [mem], scratch1
elsif X86_64
atomicxchgaddh value, [mem]
move value, scratch1
andi 0xffff, scratch1
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
addi value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw8_add_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgaddb value, [mem], scratch1
elsif X86_64
atomicxchgaddb value, [mem]
move value, scratch1
andi 0xff, scratch1
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
addi value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw16_add_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgaddh value, [mem], scratch1
elsif X86_64
atomicxchgaddh value, [mem]
move value, scratch1
andi 0xffff, scratch1
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
addi value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw32_add_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgaddi value, [mem], scratch1
elsif X86_64
atomicxchgaddi value, [mem]
move value, scratch1
ori 0, scratch1
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
addi value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw_sub, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
negi value
atomicxchgaddi value, [mem], scratch1
elsif X86_64
negi value
atomicxchgaddi value, [mem]
move value, scratch1
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
subi oldValue, value, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw_sub, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2)
if ARM64E
negq value
atomicxchgaddq value, [mem], scratch1
elsif X86_64
negq value
atomicxchgaddq value, [mem]
move value, scratch1
elsif ARM64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
subq oldValue, value, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw8_sub_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
negi value
atomicxchgaddb value, [mem], scratch1
elsif X86_64
negi value
atomicxchgaddb value, [mem]
move value, scratch1
andi 0xff, scratch1
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
subi oldValue, value, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw16_sub_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
negi value
atomicxchgaddh value, [mem], scratch1
elsif X86_64
negi value
atomicxchgaddh value, [mem]
move value, scratch1
andi 0xffff, scratch1
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
subi oldValue, value, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw8_sub_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
negq value
atomicxchgaddb value, [mem], scratch1
elsif X86_64
negq value
atomicxchgaddb value, [mem]
move value, scratch1
andi 0xff, scratch1
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
subi oldValue, value, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw16_sub_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
negq value
atomicxchgaddh value, [mem], scratch1
elsif X86_64
negq value
atomicxchgaddh value, [mem]
move value, scratch1
andi 0xffff, scratch1
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
subi oldValue, value, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw32_sub_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
negq value
atomicxchgaddi value, [mem], scratch1
elsif X86_64
negq value
atomicxchgaddi value, [mem]
move value, scratch1
ori 0, scratch1
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
subi oldValue, value, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw_and, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
noti value
atomicxchgcleari value, [mem], scratch1
elsif X86_64
weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst)
andq value, dst
end)
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
andi value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw_and, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2)
if ARM64E
notq value
atomicxchgclearq value, [mem], scratch1
elsif X86_64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro (value, dst)
andq value, dst
end)
elsif ARM64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
andq value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw8_and_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
noti value
atomicxchgclearb value, [mem], scratch1
elsif X86_64
weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst)
andq value, dst
end)
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
andi value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw16_and_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
noti value
atomicxchgclearh value, [mem], scratch1
elsif X86_64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst)
andq value, dst
end)
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
andi value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw8_and_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
notq value
atomicxchgclearb value, [mem], scratch1
elsif X86_64
weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst)
andq value, dst
end)
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
andi value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw16_and_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
notq value
atomicxchgclearh value, [mem], scratch1
elsif X86_64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst)
andq value, dst
end)
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
andi value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw32_and_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
notq value
atomicxchgcleari value, [mem], scratch1
elsif X86_64
weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst)
andq value, dst
end)
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
andi value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw_or, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgori value, [mem], scratch1
elsif X86_64
weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst)
ori value, dst
end)
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
ori value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw_or, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgorq value, [mem], scratch1
elsif X86_64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro (value, dst)
orq value, dst
end)
elsif ARM64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
orq value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw8_or_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgorb value, [mem], scratch1
elsif X86_64
weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst)
orq value, dst
end)
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
ori value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw16_or_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgorh value, [mem], scratch1
elsif X86_64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst)
orq value, dst
end)
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
ori value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw8_or_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgorb value, [mem], scratch1
elsif X86_64
weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst)
orq value, dst
end)
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
ori value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw16_or_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgorh value, [mem], scratch1
elsif X86_64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst)
orq value, dst
end)
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
ori value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw32_or_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgori value, [mem], scratch1
elsif X86_64
weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst)
orq value, dst
end)
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
ori value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw_xor, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgxori value, [mem], scratch1
elsif X86_64
weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst)
xorq value, dst
end)
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
xori value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw_xor, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgxorq value, [mem], scratch1
elsif X86_64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro (value, dst)
xorq value, dst
end)
elsif ARM64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
xorq value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw8_xor_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgxorb value, [mem], scratch1
elsif X86_64
weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst)
xorq value, dst
end)
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
xori value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw16_xor_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgxorh value, [mem], scratch1
elsif X86_64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst)
xorq value, dst
end)
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
xori value, oldValue, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw8_xor_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgxorb value, [mem], scratch1
elsif X86_64
weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst)
xorq value, dst
end)
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
xori value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw16_xor_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgxorh value, [mem], scratch1
elsif X86_64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst)
xorq value, dst
end)
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
xori value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw32_xor_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgxori value, [mem], scratch1
elsif X86_64
weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst)
xorq value, dst
end)
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
xori value, oldValue, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw_xchg, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgi value, [mem], scratch1
elsif X86_64
weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst)
move value, dst
end)
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw_xchg, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgq value, [mem], scratch1
elsif X86_64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro (value, dst)
move value, dst
end)
elsif ARM64
weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw8_xchg_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgb value, [mem], scratch1
elsif X86_64
weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst)
move value, dst
end)
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i32_atomic_rmw16_xchg_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgh value, [mem], scratch1
elsif X86_64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst)
move value, dst
end)
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
pushInt32(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw8_xchg_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgb value, [mem], scratch1
elsif X86_64
weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst)
move value, dst
end)
elsif ARM64
weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw16_xchg_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgh value, [mem], scratch1
elsif X86_64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst)
move value, dst
end)
elsif ARM64
weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
ipintOp(_i64_atomic_rmw32_xchg_u, macro()
atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2)
if ARM64E
atomicxchgi value, [mem], scratch1
elsif X86_64
weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst)
move value, dst
end)
elsif ARM64
weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue)
move value, newValue
end)
else
error
end
pushInt64(scratch1)
end)
end)
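# Compare-exchange: pops the replacement value, the expected value, and the i32
# index. After the attempt, `expected` holds the value that was previously in
# memory, and that value is pushed as the result.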
macro atomicCmpxchgOp(boundsAndAlignmentCheck, cmpxchg)
# pop value
popInt64(t1)
# pop expected
popInt64(t0)
# pop index
popInt32(t3)
ori 0, t3 # zero-extend the i32 index to the full register
# load offset
loadi IPInt::Const32Metadata::value[MC], t2
addp t2, t3
boundsAndAlignmentCheck(t3, t2)
addq memoryBase, t3
cmpxchg(t3, t1, t0, t2, t4)
loadb IPInt::Const32Metadata::instructionLength[MC], t0
advancePCByReg(t0)
advanceMC(constexpr (sizeof(IPInt::Const32Metadata)))
nextIPIntInstruction()
end
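# The weakCASExchange* helpers implement compare-exchange with LL/SC on plain ARM64:
# load-acquire the current value; if it matches `expected`, try to store-release the
# replacement, otherwise store the loaded value back so the failure path still
# completes a full LL/SC pair. Retry until a store-conditional succeeds; the loaded
# value is returned in `expected`.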
macro weakCASExchangeByte(mem, value, expected, scratch, scratch2)
if ARM64
validateOpcodeConfig(scratch2)
.loop:
loadlinkacqb [mem], scratch2
bqneq expected, scratch2, .fail
storecondrelb scratch, value, [mem]
bieq scratch, 0, .done
jmp .loop
.fail:
storecondrelb scratch, scratch2, [mem]
bieq scratch, 0, .done
jmp .loop
.done:
move scratch2, expected
else
error
end
end
macro weakCASExchangeHalf(mem, value, expected, scratch, scratch2)
if ARM64
validateOpcodeConfig(scratch2)
.loop:
loadlinkacqh [mem], scratch2
bqneq expected, scratch2, .fail
storecondrelh scratch, value, [mem]
bieq scratch, 0, .done
jmp .loop
.fail:
storecondrelh scratch, scratch2, [mem]
bieq scratch, 0, .done
jmp .loop
.done:
move scratch2, expected
else
error
end
end
macro weakCASExchangeInt(mem, value, expected, scratch, scratch2)
if ARM64
validateOpcodeConfig(scratch2)
.loop:
loadlinkacqi [mem], scratch2
bqneq expected, scratch2, .fail
storecondreli scratch, value, [mem]
bieq scratch, 0, .done
jmp .loop
.fail:
storecondreli scratch, scratch2, [mem]
bieq scratch, 0, .done
jmp .loop
.done:
move scratch2, expected
else
error
end
end
macro weakCASExchangeQuad(mem, value, expected, scratch, scratch2)
if ARM64
validateOpcodeConfig(scratch2)
.loop:
loadlinkacqq [mem], scratch2
bqneq expected, scratch2, .fail
storecondrelq scratch, value, [mem]
bieq scratch, 0, .done
jmp .loop
.fail:
storecondrelq scratch, scratch2, [mem]
bieq scratch, 0, .done
jmp .loop
.done:
move scratch2, expected
else
error
end
end
ipintOp(_i32_atomic_rmw_cmpxchg, macro()
atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, expected, scratch, scratch2)
andq 0xffffffff, expected
if ARM64E or X86_64
atomicweakcasi expected, value, [mem]
elsif ARM64
weakCASExchangeInt(mem, value, expected, scratch, scratch2)
else
error
end
pushInt32(expected)
end)
end)
ipintOp(_i64_atomic_rmw_cmpxchg, macro()
atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, expected, scratch, scratch2)
if ARM64E or X86_64
atomicweakcasq expected, value, [mem]
elsif ARM64
weakCASExchangeQuad(mem, value, expected, scratch, scratch2)
else
error
end
pushInt64(expected)
end)
end)
ipintOp(_i32_atomic_rmw8_cmpxchg_u, macro()
atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, expected, scratch, scratch2)
andq 0xff, expected
if ARM64E or X86_64
atomicweakcasb expected, value, [mem]
elsif ARM64
weakCASExchangeByte(mem, value, expected, scratch, scratch2)
else
error
end
pushInt32(expected)
end)
end)
ipintOp(_i32_atomic_rmw16_cmpxchg_u, macro()
atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, expected, scratch, scratch2)
andq 0xffff, expected
if ARM64E or X86_64
atomicweakcash expected, value, [mem]
elsif ARM64
weakCASExchangeHalf(mem, value, expected, scratch, scratch2)
else
error
end
pushInt32(expected)
end)
end)
ipintOp(_i64_atomic_rmw8_cmpxchg_u, macro()
atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, expected, scratch, scratch2)
andq 0xff, expected
if ARM64E or X86_64
atomicweakcasb expected, value, [mem]
elsif ARM64
weakCASExchangeByte(mem, value, expected, scratch, scratch2)
else
error
end
pushInt64(expected)
end)
end)
ipintOp(_i64_atomic_rmw16_cmpxchg_u, macro()
atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, expected, scratch, scratch2)
andq 0xffff, expected
if ARM64E or X86_64
atomicweakcash expected, value, [mem]
elsif ARM64
weakCASExchangeHalf(mem, value, expected, scratch, scratch2)
else
error
end
pushInt64(expected)
end)
end)
ipintOp(_i64_atomic_rmw32_cmpxchg_u, macro()
atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, expected, scratch, scratch2)
andq 0xffffffff, expected
if ARM64E or X86_64
atomicweakcasi expected, value, [mem]
elsif ARM64
weakCASExchangeInt(mem, value, expected, scratch, scratch2)
else
error
end
pushInt64(expected)
end)
end)
#######################################
## ULEB128 decoding logic for locals ##
#######################################
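# ULEB128: each byte carries 7 payload bits, least-significant group first, and the
# top bit of each byte is a continuation flag. For example, the bytes 0xE5 0x8E 0x26
# decode to 0x65 | (0x0E << 7) | (0x26 << 14) = 624485.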
macro decodeULEB128(result)
# result should already be the first byte.
andq 0x7f, result
move 7, t2 # t2 holds the shift.
validateOpcodeConfig(t3)
.loop:
loadb [PC], t3
andq t3, 0x7f, t1
lshiftq t2, t1
orq t1, result
addq 7, t2
advancePC(1)
bbaeq t3, 128, .loop
end
slowPathLabel(_local_get)
decodeULEB128(t0)
localGetPostDecode()
slowPathLabel(_local_set)
decodeULEB128(t0)
localSetPostDecode()
slowPathLabel(_local_tee)
decodeULEB128(t0)
localTeePostDecode()
##################################
## "Out of line" logic for call ##
##################################
const mintSS = sc1
macro mintPop(reg)
loadq [mintSS], reg
addq V128ISize, mintSS
end
macro mintPopV(reg)
loadv [mintSS], reg
addq V128ISize, mintSS
end
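# mINT (the call argument / result mini-interpreter) dispatches the same way as the
# main interpreter loop: load a one-byte opcode from MC, scale it by the mINT handler
# alignment, and jump relative to _mint_begin (or _mint_begin_return for results).
# Out-of-range opcodes branch to the dispatch error labels.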
macro mintArgDispatch()
loadb [MC], sc0
addq 1, MC
bigteq sc0, (constexpr IPInt::CallArgumentBytecode::NumOpcodes), _ipint_mint_arg_dispatch_err
lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignMInt))), sc0
if ARM64 or ARM64E
pcrtoaddr _mint_begin, csr4
addq sc0, csr4
jmp csr4
elsif X86_64
leap (_mint_begin - _mint_arg_relativePCBase)[PC, sc0], sc0
jmp sc0
end
end
macro mintRetDispatch()
loadb [MC], sc0
addq 1, MC
bigteq sc0, (constexpr IPInt::CallResultBytecode::NumOpcodes), _ipint_mint_ret_dispatch_err
lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignMInt))), sc0
if ARM64 or ARM64E
pcrtoaddr _mint_begin_return, csr4
addq sc0, csr4
jmp csr4
elsif X86_64
leap (_mint_begin_return - _mint_ret_relativePCBase)[PC, sc0], sc0
jmp sc0
end
end
.ipint_call_common:
# we need to plan ahead so we don't step on our own values later
# step 1: save all the stuff we had earlier
# step 2: calling
# - if we have more results than arguments, we need to reserve extra stack space in advance,
# or else pushing 16B values onto the stack would overtake the cleanup of the 8B return values.
# the amount of extra space comes from CallSignatureMetadata::numExtraResults
# - set up the stack frame (with size CallSignatureMetadata::stackFrameSize)
# step 2.5: saving registers:
# - push our important data onto the stack here, after the saved space
# step 3: jump to called function
# - swap out instances, reload memory, and call
# step 4: returning
# - pop the registers from step 2.5
# - we've left enough space for us to push our new values starting at the original stack pointer now! yay!
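# worked example (assuming numExtraResults counts results beyond the argument count): a
# callee taking 1 argument and returning 3 values leaves 1 argument slot to reuse, so
# numExtraResults would be 2 and we reserve 2 * StackValueSize = 32 extra bytes up front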
# Free up r0 to be used as argument register
const targetEntrypoint = sc2
const targetInstance = sc3
move r0, targetEntrypoint
move r1, targetInstance
const extraSpaceForReturns = t0
const stackFrameSize = t1
const numArguments = t2
loadi IPInt::CallSignatureMetadata::stackFrameSize[MC], stackFrameSize
loadh IPInt::CallSignatureMetadata::numExtraResults[MC], extraSpaceForReturns
mulq StackValueSize, extraSpaceForReturns
loadh IPInt::CallSignatureMetadata::numArguments[MC], numArguments
mulq StackValueSize, numArguments
advanceMC(constexpr (sizeof(IPInt::CallSignatureMetadata)))
# calculate the SP after popping all arguments
move sp, t3
addp numArguments, t3
# (down = decreasing address)
# <first non-arg> <- t3 = SP after all arguments
# arg
# ...
# arg
# arg <- initial SP (wasm stack)
# store sp as our shadow stack for arguments later
move sp, t4
# make extra space if necessary
subp extraSpaceForReturns, sp
# <first non-arg> <- t3
# arg
# ...
# arg
# arg <- t4 = initial SP (wasm stack)
# reserved
# reserved <- sp
# save t3 as a frame-relative value so stack data can be moved easily for JSPI
# t3 is not used after this
subp cfr, t3
push t3, PC
push PL, wasmInstance
# set up the call frame
move sp, t2
subp stackFrameSize, sp
# <first non-arg> <- first_non_arg_addr
# arg
# ...
# arg
# arg <- t4 = initial SP (wasm stack)
# reserved
# reserved
# (first_non_arg_addr - cfr), PC
# PL, wasmInstance <- t2 = native argument stack (pushed by mINT)
# call frame
# call frame
# call frame
# call frame
# call frame
# call frame <- sp
# set up the Callee slot
storeq IPIntCallCallee, Callee - CallerFrameAndPCSize[sp]
storep IPIntCallFunctionSlot, CodeBlock - CallerFrameAndPCSize[sp]
push targetEntrypoint, targetInstance
move t2, sc3
move t4, mintSS
# need a common entrypoint because of x86 PC base
jmp .ipint_mint_arg_dispatch
.ipint_tail_call_common:
# Free up r0 to be used as argument register
# <caller frame>
# return val
# return val
# argument
# argument
# argument
# argument
# call frame
# call frame <- cfr
# (IPInt locals)
# (IPInt stack)
# argument 0
# ...
# argument n-1
# argument n <- sp
# sc1 = target callee => wasmInstance to free up sc1
const savedCallee = wasmInstance
# store entrypoint and target instance on the stack for now
push r0, r1
push IPIntCallCallee, IPIntCallFunctionSlot
# keep the top of IPInt stack in sc1 as shadow stack
move sp, sc1
# we pushed four 8-byte values above (r0, r1, IPIntCallCallee, IPIntCallFunctionSlot), so skip past them
addq 32, sc1
# <caller frame>
# return val
# return val
# argument
# argument
# argument
# argument
# call frame
# call frame <- cfr
# (IPInt locals)
# (IPInt stack)
# argument 0
# ...
# argument n-1
# argument n <- sc1
# entrypoint, targetInstance
# callee, function info <- sp
# determine the location to begin copying stack arguments, starting from the last
move cfr, sc2
addp FirstArgumentOffset, sc2
addp t3, sc2 # t3 = callerStackArgSize from the metadata
# <caller frame> <- sc2
# return val
# return val
# argument
# argument
# argument
# argument
# call frame
# call frame <- cfr
# (IPInt locals)
# (IPInt stack)
# argument 0
# ...
# argument n-1
# argument n <- sc1
# entrypoint, targetInstance
# callee, function info <- sp
# get saved MC and PC
if ARM64 or ARM64E
loadpairq -0x10[cfr], t0, t1
elsif X86_64 or RISCV64
loadp -0x8[cfr], t1
loadp -0x10[cfr], t0
end
push t0, t1
# store the return address and CFR on the stack so we don't lose it
loadp ReturnPC[cfr], t0
loadp [cfr], t1
push t0, t1
# <caller frame> <- sc2
# return val
# return val
# argument
# argument
# argument
# argument
# call frame
# call frame <- cfr
# (IPInt locals)
# (IPInt stack)
# argument 0
# ...
# argument n-1
# argument n <- sc1
# entrypoint, targetInstance
# callee, function info
# saved MC/PC
# return address, saved CFR <- sp
.ipint_mint_arg_dispatch:
# on x86, we'll use PC for our PC base
initPCRelative(mint_arg, PC)
// We've already called validateOpcodeConfig() in all the Wasm call opcodes.
mintArgDispatch()
# tail calls reuse most of mINT's argument logic, but exit into a different tail call stub.
# we use sc2 to keep the new stack frame
mintAlign(_a0)
_mint_begin:
mintPop(a0)
mintArgDispatch()
mintAlign(_a1)
mintPop(a1)
mintArgDispatch()
mintAlign(_a2)
if ARM64 or ARM64E or X86_64
mintPop(a2)
mintArgDispatch()
else
break
end
mintAlign(_a3)
if ARM64 or ARM64E or X86_64
mintPop(a3)
mintArgDispatch()
else
break
end
mintAlign(_a4)
if ARM64 or ARM64E or X86_64
mintPop(a4)
mintArgDispatch()
else
break
end
mintAlign(_a5)
if ARM64 or ARM64E or X86_64
mintPop(a5)
mintArgDispatch()
else
break
end
mintAlign(_a6)
if ARM64 or ARM64E
mintPop(a6)
mintArgDispatch()
else
break
end
mintAlign(_a7)
if ARM64 or ARM64E
mintPop(a7)
mintArgDispatch()
else
break
end
mintAlign(_fa0)
mintPopV(wfa0)
mintArgDispatch()
mintAlign(_fa1)
mintPopV(wfa1)
mintArgDispatch()
mintAlign(_fa2)
mintPopV(wfa2)
mintArgDispatch()
mintAlign(_fa3)
mintPopV(wfa3)
mintArgDispatch()
mintAlign(_fa4)
mintPopV(wfa4)
mintArgDispatch()
mintAlign(_fa5)
mintPopV(wfa5)
mintArgDispatch()
mintAlign(_fa6)
mintPopV(wfa6)
mintArgDispatch()
mintAlign(_fa7)
mintPopV(wfa7)
mintArgDispatch()
# Note that the regular call and tail call opcodes are implemented slightly differently.
# Regular calls have to save space for return values, while tail calls reuse the stack frame
# and thus do not have to.
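# In the regular-call stubs below, sc3 is the cursor into the outgoing stack-argument area
# (it was set to the native argument stack top before dispatch); each DecSP opcode moves it
# down by one 16-byte (two-argument) slot before the value is stored.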
# CallArgumentBytecode::CallArgDecSP (0x10)
mintAlign(_call_argument_dec_sp)
subp 2 * SlotSize, sc3
mintArgDispatch()
# CallArgumentBytecode::CallArgStore0 (0x11)
mintAlign(_call_argument_store_0)
mintPop(sc2)
storeq sc2, [sc3]
mintArgDispatch()
# CallArgumentBytecode::CallArgDecSPStore8 (0x12)
mintAlign(_call_argument_dec_sp_store_8)
mintPop(sc2)
subp 2 * SlotSize, sc3
storeq sc2, 8[sc3]
mintArgDispatch()
# CallArgumentBytecode::CallArgDecSPStoreVector0 (0x13)
mintAlign(_call_argument_dec_sp_store_vector_0)
subp 2 * SlotSize, sc3
loadq [mintSS], sc2
storeq sc2, [sc3]
loadq 8[mintSS], sc2
storeq sc2, 8[sc3]
addq StackValueSize, mintSS
mintArgDispatch()
# CallArgumentBytecode::CallArgDecSPStoreVector8 (0x14)
mintAlign(_call_argument_dec_sp_store_vector_8)
subp 2 * SlotSize, sc3
loadq [mintSS], sc2
storeq sc2, 8[sc3]
loadq 8[mintSS], sc2
storeq sc2, 16[sc3]
addq StackValueSize, mintSS
mintArgDispatch()
# For tail calls, we're writing into the same frame. We're going to first push stack arguments onto the stack.
# Once we're done, we'll copy them back down into the new frame, to avoid having to deal with writing over
# arguments lower down on the stack.
# CallArgumentBytecode::TailCallArgDecSP (0x15)
mintAlign(_tail_call_argument_dec_sp)
subp 2 * SlotSize, sp
mintArgDispatch()
# CallArgumentBytecode::TailCallArgStore0 (0x16)
mintAlign(_tail_call_argument_store_0)
mintPop(sc3)
storeq sc3, [sp]
mintArgDispatch()
# CallArgumentBytecode::TailCallArgDecSPStore8 (0x17)
mintAlign(_tail_call_argument_dec_sp_store_8)
mintPop(sc3)
subp 2 * SlotSize, sp
storeq sc3, 8[sp]
mintArgDispatch()
# CallArgumentBytecode::TailCallArgDecSPStoreVector0 (0x18)
mintAlign(_tail_call_argument_dec_sp_store_vector_0)
subp 2 * SlotSize, sp
loadq [mintSS], sc3
storeq sc3, [sp]
loadq 8[mintSS], sc3
storeq sc3, 8[sp]
addq StackValueSize, mintSS
mintArgDispatch()
# CallArgumentBytecode::TailCallArgDecSPStoreVector8 (0x19)
mintAlign(_tail_call_argument_dec_sp_store_vector_8)
subp 2 * SlotSize, sp
loadq [mintSS], sc3
storeq sc3, 8[sp]
loadq 8[mintSS], sc3
storeq sc3, 16[sp]
addq StackValueSize, mintSS
mintArgDispatch()
# CallArgumentBytecode::TailCall (0x1a)
mintAlign(_tail_call)
jmp .ipint_perform_tail_call
# CallArgumentBytecode::Call (0x1b)
mintAlign(_call)
pop wasmInstance, ws0
# i.e. pop targetInstance (into wasmInstance) and targetEntrypoint (into ws0)
# Save stack pointer, if we tail call someone who changes the frame above's stack argument size.
# Store its value relative to cfp so stack frames can be easily relocated for JSPI.
move sp, sc1
subp cfr, sc1
storep sc1, ThisArgumentOffset[cfr]
# Swap instances
# move targetInstance, wasmInstance
# Set up memory
push t2, t3
ipintReloadMemory()
pop t3, t2
# move targetEntrypoint, ws0
# Make the call
if ARM64E
leap _g_config, ws1
jmp JSCConfigGateMapOffset + (constexpr Gate::wasm_ipint_call) * PtrSize[ws1], NativeToJITGatePtrTag # WasmEntryPtrTag
end
_wasm_trampoline_wasm_ipint_call:
_wasm_trampoline_wasm_ipint_call_wide16:
_wasm_trampoline_wasm_ipint_call_wide32:
call ws0, WasmEntryPtrTag
_wasm_ipint_call_return_location:
_wasm_ipint_call_return_location_wide16:
_wasm_ipint_call_return_location_wide32:
# Restore the stack pointer
loadp ThisArgumentOffset[cfr], sc0
addp cfr, sc0
move sc0, sp
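# (we saved this SP frame-relative at ThisArgumentOffset before the call so that the frame
# can be relocated for JSPI)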
# <first non-arg> <- first_non_arg_addr
# arg
# ...
# arg
# arg
# reserved
# reserved
# (first_non_arg_addr - cfr), PC
# PL, wasmInstance <- sc3
# call frame return
# call frame return
# call frame
# call frame
# call frame
# call frame <- sp
loadi IPInt::CallReturnMetadata::stackFrameSize[MC], sc3
leap [sp, sc3], sc3
const mintRetSrc = sc1
const mintRetDst = sc2
loadi IPInt::CallReturnMetadata::firstStackResultSPOffset[MC], mintRetSrc
advanceMC(IPInt::CallReturnMetadata::resultBytecode)
leap [sp, mintRetSrc], mintRetSrc
# load (first_non_arg_addr - cfr) from the stack and make it absolute
if ARM64 or ARM64E
loadp (2 * SlotSize)[sc3], mintRetDst
elsif X86_64
loadp (3 * SlotSize)[sc3], mintRetDst
end
addp cfr, mintRetDst
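# mintRetSrc walks the stack-passed results (starting at firstStackResultSPOffset), while
# mintRetDst pushes results onto the IPInt value stack, growing down from the top of the
# old argument area (overwriting the popped arguments and, if needed, the reserved space).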
# on x86, we'll use PC again for our PC base
initPCRelative(mint_ret, PC)
// We've already called validateOpcodeConfig() in all the Wasm call opcodes, and
// that is the only way to get here.
mintRetDispatch()
mintAlign(_r0)
_mint_begin_return:
subp StackValueSize, mintRetDst
storeq wa0, [mintRetDst]
mintRetDispatch()
mintAlign(_r1)
subp StackValueSize, mintRetDst
storeq wa1, [mintRetDst]
mintRetDispatch()
mintAlign(_r2)
if ARM64 or ARM64E or X86_64
subp StackValueSize, mintRetDst
storeq wa2, [mintRetDst]
mintRetDispatch()
else
break
end
mintAlign(_r3)
if ARM64 or ARM64E or X86_64
subp StackValueSize, mintRetDst
storeq wa3, [mintRetDst]
mintRetDispatch()
else
break
end
mintAlign(_r4)
if ARM64 or ARM64E or X86_64
subp StackValueSize, mintRetDst
storeq wa4, [mintRetDst]
mintRetDispatch()
else
break
end
mintAlign(_r5)
if ARM64 or ARM64E or X86_64
subp StackValueSize, mintRetDst
storeq wa5, [mintRetDst]
mintRetDispatch()
else
break
end
mintAlign(_r6)
if ARM64 or ARM64E
subp StackValueSize, mintRetDst
storeq wa6, [mintRetDst]
mintRetDispatch()
else
break
end
mintAlign(_r7)
if ARM64 or ARM64E
subp StackValueSize, mintRetDst
storeq wa7, [mintRetDst]
mintRetDispatch()
else
break
end
mintAlign(_fr0)
subp StackValueSize, mintRetDst
storev wfa0, [mintRetDst]
mintRetDispatch()
mintAlign(_fr1)
subp StackValueSize, mintRetDst
storev wfa1, [mintRetDst]
mintRetDispatch()
mintAlign(_fr2)
subp StackValueSize, mintRetDst
storev wfa2, [mintRetDst]
mintRetDispatch()
mintAlign(_fr3)
subp StackValueSize, mintRetDst
storev wfa3, [mintRetDst]
mintRetDispatch()
mintAlign(_fr4)
subp StackValueSize, mintRetDst
storev wfa4, [mintRetDst]
mintRetDispatch()
mintAlign(_fr5)
subp StackValueSize, mintRetDst
storev wfa5, [mintRetDst]
mintRetDispatch()
mintAlign(_fr6)
subp StackValueSize, mintRetDst
storev wfa6, [mintRetDst]
mintRetDispatch()
mintAlign(_fr7)
subp StackValueSize, mintRetDst
storev wfa7, [mintRetDst]
mintRetDispatch()
# CallResultBytecode::ResultStack (0x10)
mintAlign(_result_stack)
loadq [mintRetSrc], sc0
addp SlotSize, mintRetSrc
subp StackValueSize, mintRetDst
storeq sc0, [mintRetDst]
mintRetDispatch()
# CallResultBytecode::ResultStackVector (0x11)
mintAlign(_result_stack_vector)
subp StackValueSize, mintRetDst
loadq [mintRetSrc], sc0
storeq sc0, [mintRetDst]
loadq 8[mintRetSrc], sc0
storeq sc0, 8[mintRetDst]
addp 2 * SlotSize, mintRetSrc
mintRetDispatch()
mintAlign(_end)
# <first non-arg> <- first_non_arg_addr
# return result
# ...
# return result
# return result
# return result
# return result <- mintRetDst => new SP
# (first_non_arg_addr - cfr), PC
# PL, wasmInstance <- sc3
# call frame return <- mintRetSrc
# call frame return
# call frame
# call frame
# call frame
# call frame <- sp
# note: we don't care about t3 anymore
if ARM64 or ARM64E
loadpairq [sc3], PL, wasmInstance
else
loadq [sc3], wasmInstance
end
move mintRetDst, sp
if X86_64
move wasmInstance, sc2
end
# Restore PC / MC
loadp Callee[cfr], ws0
unboxWasmCallee(ws0, ws1)
storep ws0, UnboxedWasmCalleeStackSlot[cfr]
if X86_64
move sc2, wasmInstance
loadq 8[sc3], PL
loadp (2 * SlotSize)[sc3], PC
end
# Restore memory
ipintReloadMemory()
nextIPIntInstruction()
.ipint_perform_tail_call:
# <caller frame> <- sc2
# return val
# return val
# argument
# argument
# argument
# argument
# call frame
# call frame <- cfr
# (IPInt locals)
# (IPInt stack) <- sc1 (was shadow stack, now dead and can re-use)
# argument 0
# ...
# argument n-1
# argument n
# entrypoint, targetInstance
# callee, function info
# saved MC/PC
# return address, saved CFR
# stack arguments
# stack arguments
# stack arguments
# stack arguments <- sp
# load the size of the arguments and results space, and subtract that from sc2
loadi [MC], sc3
negq sc3
# copy args to sc2 region
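# the loop copies 16 bytes per iteration from the scratch area at sp up into the caller's
# stack-argument area, which ends at sc2; sc3 walks from -size up to 0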
validateOpcodeConfig(sc0)
.ipint_tail_call_copy_stackargs_loop:
bqgteq sc3, 0, .ipint_tail_call_copy_stackargs_loop_end
if ARM64 or ARM64E
loadpairq [sp], sc0, sc1
storepairq sc0, sc1, [sc2, sc3]
else
loadq [sp], sc0
loadq 8[sp], sc1
storeq sc0, [sc2, sc3]
storeq sc1, 8[sc2, sc3]
end
addp 16, sc3
addp 16, sp
jmp .ipint_tail_call_copy_stackargs_loop
.ipint_tail_call_copy_stackargs_loop_end:
# reload it here, which isn't optimal, but we don't really have registers
loadi [MC], sc3
subp sc3, sc2
# re-setup the call frame, and load our return address in
subp FirstArgumentOffset, sc2
if X86_64
pop sc1, sc0
storep sc0, ReturnPC[sc2]
elsif ARM64 or ARM64E or ARMv7 or RISCV64
pop sc1, lr
end
pop PC, MC
# function info, callee
pop sc3, sc0
# save new Callee
storeq sc0, Callee[sc2]
storep sc3, CodeBlock[sc2]
# take off the last two values we stored, and move SP down to make it look like a fresh frame
pop targetInstance, ws0
# <caller frame>
# return val
# return val
# ...
# argument
# argument
# argument
# argument
# argument <- cfr
# argument
# argument
# <to be frame>
# <to be frame> <- NEW SP
# <to be frame> <- sc2
# argument 0
# ...
# argument n-1
# argument n
# on ARM: lr = return address
move sc2, sp
if ARM64E
addp CallerFrameAndPCSize, cfr, ws2
end
# saved cfr
move sc1, cfr
# swap instances
move targetInstance, wasmInstance
# set up memory
push t2, t3
ipintReloadMemory()
pop t3, t2
addp CallerFrameAndPCSize, sp
if X86_64
subp 8, sp
end
# go!
if ARM64E
leap _g_config, ws1
jmp JSCConfigGateMapOffset + (constexpr Gate::wasmIPIntTailCallWasmEntryPtrTag) * PtrSize[ws1], NativeToJITGatePtrTag # WasmEntryPtrTag
end
_wasm_trampoline_wasm_ipint_tail_call:
_wasm_trampoline_wasm_ipint_tail_call_wide16:
_wasm_trampoline_wasm_ipint_tail_call_wide32:
jmp ws0, WasmEntryPtrTag
_ipint_argument_dispatch_err:
move 0x55, a0
break
_ipint_uint_dispatch_err:
move 0x66, a0
break
_ipint_mint_arg_dispatch_err:
move 0x77, a0
break
_ipint_mint_ret_dispatch_err:
move 0x88, a0
break
###########################################
# uINT: function return value interpreter #
###########################################
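# uINT runs at function exit: it walks a small result bytecode, popping each return value
# off the IPInt stack into the wasm return registers (or, for stack results, storing it
# through sc0 into the caller's result area), then jumps to .ipint_exit.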
uintAlign(_r0)
_uint_begin:
popQuad(wa0)
uintDispatch()
uintAlign(_r1)
popQuad(wa1)
uintDispatch()
uintAlign(_r2)
popQuad(wa2)
uintDispatch()
uintAlign(_r3)
popQuad(wa3)
uintDispatch()
uintAlign(_r4)
popQuad(wa4)
uintDispatch()
uintAlign(_r5)
popQuad(wa5)
uintDispatch()
uintAlign(_r6)
if ARM64 or ARM64E
popQuad(wa6)
uintDispatch()
else
break
end
uintAlign(_r7)
if ARM64 or ARM64E
popQuad(wa7)
uintDispatch()
else
break
end
uintAlign(_fr0)
popVec(wfa0)
uintDispatch()
uintAlign(_fr1)
popVec(wfa1)
uintDispatch()
uintAlign(_fr2)
popVec(wfa2)
uintDispatch()
uintAlign(_fr3)
popVec(wfa3)
uintDispatch()
uintAlign(_fr4)
popVec(wfa4)
uintDispatch()
uintAlign(_fr5)
popVec(wfa5)
uintDispatch()
uintAlign(_fr6)
popVec(wfa6)
uintDispatch()
uintAlign(_fr7)
popVec(wfa7)
uintDispatch()
# destination on stack is sc0
uintAlign(_stack)
popInt64(sc1)
subp SlotSize, sc0
storeq sc1, [sc0]
uintDispatch()
uintAlign(_stack_vector)
subp 2 * SlotSize, sc0
loadq [sp], sc1
storeq sc1, [sc0]
loadq 8[sp], sc1
storeq sc1, 8[sc0]
addq StackValueSize, sp
uintDispatch()
uintAlign(_ret)
jmp .ipint_exit
# MC = location in argumINT bytecode
# csr0 = tmp
# csr1 = dst
# csr2 = src
# csr3
# csr4 = for dispatch
# const argumINTDst = csr3
# const argumINTSrc = PB
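# argumINT runs at function entry: it copies incoming arguments out of the argument
# registers (wa0-wa7 / wfa0-wfa7) and the caller's stack (via argumINTSrc) into the locals
# area, advancing argumINTDst by LocalSize per local, then jumps to .ipint_entry_end_local.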
argumINTAlign(_a0)
_argumINT_begin:
storeq wa0, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_a1)
storeq wa1, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_a2)
if ARM64 or ARM64E or X86_64
storeq wa2, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
else
break
end
argumINTAlign(_a3)
if ARM64 or ARM64E or X86_64
storeq wa3, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
else
break
end
argumINTAlign(_a4)
if ARM64 or ARM64E or X86_64
storeq wa4, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
else
break
end
argumINTAlign(_a5)
if ARM64 or ARM64E or X86_64
storeq wa5, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
else
break
end
argumINTAlign(_a6)
if ARM64 or ARM64E
storeq wa6, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
else
break
end
argumINTAlign(_a7)
if ARM64 or ARM64E
storeq wa7, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
else
break
end
argumINTAlign(_fa0)
storev wfa0, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_fa1)
storev wfa1, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_fa2)
storev wfa2, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_fa3)
storev wfa3, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_fa4)
storev wfa4, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_fa5)
storev wfa5, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_fa6)
storev wfa6, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_fa7)
storev wfa7, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_stack)
loadq [argumINTSrc], csr0
addp SlotSize, argumINTSrc
storeq csr0, [argumINTDst]
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_stack_vector)
loadq [argumINTSrc], csr0
storeq csr0, [argumINTDst]
loadq 8[argumINTSrc], csr0
storeq csr0, 8[argumINTDst]
addp 2 * SlotSize, argumINTSrc
addp LocalSize, argumINTDst
argumINTDispatch()
argumINTAlign(_end)
jmp .ipint_entry_end_local
if ARM64E
global _wasmTailCallTrampoline
_wasmTailCallTrampoline:
untagReturnAddress ws2
jmp ws0, WasmEntryPtrTag
end