interpreter.go

   1package interpreter
   2
   3import (
   4	"context"
   5	"encoding/binary"
   6	"errors"
   7	"fmt"
   8	"math"
   9	"math/bits"
  10	"sync"
  11	"unsafe"
  12
  13	"github.com/tetratelabs/wazero/api"
  14	"github.com/tetratelabs/wazero/experimental"
  15	"github.com/tetratelabs/wazero/internal/expctxkeys"
  16	"github.com/tetratelabs/wazero/internal/filecache"
  17	"github.com/tetratelabs/wazero/internal/internalapi"
  18	"github.com/tetratelabs/wazero/internal/moremath"
  19	"github.com/tetratelabs/wazero/internal/wasm"
  20	"github.com/tetratelabs/wazero/internal/wasmdebug"
  21	"github.com/tetratelabs/wazero/internal/wasmruntime"
  22)
  23
// callStackCeiling is the maximum WebAssembly call frame stack height. Once this many frames are live,
// pushFrame panics with wasmruntime.ErrRuntimeStackOverflow instead of overflowing the Go runtime.
//
// The default value should suffice for most use cases. Those wishing to change this can via `go build -ldflags`.
var callStackCeiling = 2000
  29
// engine is an interpreter implementation of wasm.Engine.
//
// It keeps the compiled functions of each module, keyed by module ID, so
// that CompileModule can return early for a module it has already compiled.
type engine struct {
	enabledFeatures   api.CoreFeatures
	compiledFunctions map[wasm.ModuleID][]compiledFunction // guarded by mux.
	mux               sync.RWMutex
}
  36
  37func NewEngine(_ context.Context, enabledFeatures api.CoreFeatures, _ filecache.Cache) wasm.Engine {
  38	return &engine{
  39		enabledFeatures:   enabledFeatures,
  40		compiledFunctions: map[wasm.ModuleID][]compiledFunction{},
  41	}
  42}
  43
  44// Close implements the same method as documented on wasm.Engine.
  45func (e *engine) Close() (err error) {
  46	return
  47}
  48
  49// CompiledModuleCount implements the same method as documented on wasm.Engine.
  50func (e *engine) CompiledModuleCount() uint32 {
  51	return uint32(len(e.compiledFunctions))
  52}
  53
  54// DeleteCompiledModule implements the same method as documented on wasm.Engine.
  55func (e *engine) DeleteCompiledModule(m *wasm.Module) {
  56	e.deleteCompiledFunctions(m)
  57}
  58
  59func (e *engine) deleteCompiledFunctions(module *wasm.Module) {
  60	e.mux.Lock()
  61	defer e.mux.Unlock()
  62	delete(e.compiledFunctions, module.ID)
  63}
  64
  65func (e *engine) addCompiledFunctions(module *wasm.Module, fs []compiledFunction) {
  66	e.mux.Lock()
  67	defer e.mux.Unlock()
  68	e.compiledFunctions[module.ID] = fs
  69}
  70
  71func (e *engine) getCompiledFunctions(module *wasm.Module) (fs []compiledFunction, ok bool) {
  72	e.mux.RLock()
  73	defer e.mux.RUnlock()
  74	fs, ok = e.compiledFunctions[module.ID]
  75	return
  76}
  77
// moduleEngine implements wasm.ModuleEngine
type moduleEngine struct {
	// functions are the functions of a module instance, imported functions
	// first followed by the module's own compiled functions.
	// The index is module instance-scoped.
	functions []function

	// parentEngine holds *engine from which this module engine is created from.
	parentEngine *engine
}
  87
// GetGlobalValue implements the same method as documented on wasm.ModuleEngine.
//
// The interpreter reports OwnsGlobals as false, so this always panics to
// flag the call as a bug.
func (e *moduleEngine) GetGlobalValue(wasm.Index) (lo, hi uint64) {
	panic("BUG: GetGlobalValue should never be called on interpreter mode")
}
  92
// SetGlobalValue implements the same method as documented on wasm.ModuleEngine.
//
// The interpreter reports OwnsGlobals as false, so this always panics to
// flag the call as a bug.
func (e *moduleEngine) SetGlobalValue(idx wasm.Index, lo, hi uint64) {
	panic("BUG: SetGlobalValue should never be called on interpreter mode")
}
  97
// OwnsGlobals implements the same method as documented on wasm.ModuleEngine.
// The interpreter always reports false.
func (e *moduleEngine) OwnsGlobals() bool { return false }
 100
// MemoryGrown implements wasm.ModuleEngine. It is a no-op in the interpreter.
func (e *moduleEngine) MemoryGrown() {}
 103
// callEngine holds context per moduleEngine.Call, and shared across all the
// function calls originating from the same moduleEngine.Call execution.
//
// This implements api.Function.
type callEngine struct {
	internalapi.WazeroOnlyType

	// stack contains the operands.
	// Note that all the values are represented as uint64.
	stack []uint64

	// frames are the function call stack. pushFrame refuses to grow it
	// beyond callStackCeiling.
	frames []*callFrame

	// f is the initial function for this call engine.
	f *function

	// stackIterator is handed to Listeners so they can walk frames and stack.
	// It is reset before, and cleared after, each listener invocation.
	stackIterator stackIterator
}
 124
 125func (e *moduleEngine) newCallEngine(compiled *function) *callEngine {
 126	return &callEngine{f: compiled}
 127}
 128
// pushValue appends v to the top of the operand stack.
func (ce *callEngine) pushValue(v uint64) {
	ce.stack = append(ce.stack, v)
}
 132
// pushValues appends all of v to the operand stack in order, so the last
// element of v ends up on top.
func (ce *callEngine) pushValues(v []uint64) {
	ce.stack = append(ce.stack, v...)
}
 136
 137func (ce *callEngine) popValue() (v uint64) {
 138	// No need to check stack bound
 139	// as we can assume that all the operations
 140	// are valid thanks to validateFunction
 141	// at module validation phase
 142	// and interpreterir translation
 143	// before compilation.
 144	stackTopIndex := len(ce.stack) - 1
 145	v = ce.stack[stackTopIndex]
 146	ce.stack = ce.stack[:stackTopIndex]
 147	return
 148}
 149
 150func (ce *callEngine) popValues(v []uint64) {
 151	stackTopIndex := len(ce.stack) - len(v)
 152	copy(v, ce.stack[stackTopIndex:])
 153	ce.stack = ce.stack[:stackTopIndex]
 154}
 155
 156// peekValues peeks api.ValueType values from the stack and returns them.
 157func (ce *callEngine) peekValues(count int) []uint64 {
 158	if count == 0 {
 159		return nil
 160	}
 161	stackLen := len(ce.stack)
 162	return ce.stack[stackLen-count : stackLen]
 163}
 164
 165func (ce *callEngine) drop(raw uint64) {
 166	r := inclusiveRangeFromU64(raw)
 167	if r.Start == -1 {
 168		return
 169	} else if r.Start == 0 {
 170		ce.stack = ce.stack[:int32(len(ce.stack))-1-r.End]
 171	} else {
 172		newStack := ce.stack[:int32(len(ce.stack))-1-r.End]
 173		newStack = append(newStack, ce.stack[int32(len(ce.stack))-r.Start:]...)
 174		ce.stack = newStack
 175	}
 176}
 177
 178func (ce *callEngine) pushFrame(frame *callFrame) {
 179	if callStackCeiling <= len(ce.frames) {
 180		panic(wasmruntime.ErrRuntimeStackOverflow)
 181	}
 182	ce.frames = append(ce.frames, frame)
 183}
 184
 185func (ce *callEngine) popFrame() (frame *callFrame) {
 186	// No need to check stack bound as we can assume that all the operations are valid thanks to validateFunction at
 187	// module validation phase and interpreterir translation before compilation.
 188	oneLess := len(ce.frames) - 1
 189	frame = ce.frames[oneLess]
 190	ce.frames = ce.frames[:oneLess]
 191	return
 192}
 193
// callFrame is one entry of the callEngine's function call stack.
type callFrame struct {
	// pc is the program counter representing the current position in the
	// body of f's compiled function.
	pc uint64
	// f is the compiled function used in this function frame.
	f *function
	// base index in the frame of this function, used to detect the count of
	// values on the stack. It is set to the operand stack height at the time
	// the frame is created.
	base int
}
 203
// compiledFunction is the module-scoped result of CompileModule for a single
// function; it is shared by every instance of the module.
//
// source is the module the function belongs to and index is its position in
// that module's function index space (imports included). For Wasm functions,
// body holds the lowered operations and offsetsInWasmBinary, when non-empty,
// holds the source offset of each operation. For host functions body is nil
// and hostFn holds the Go function instead. listener is nil when no listener
// was supplied for the function.
type compiledFunction struct {
	source              *wasm.Module
	body                []unionOperation
	listener            experimental.FunctionListener
	offsetsInWasmBinary []uint64
	hostFn              interface{}
	ensureTermination   bool
	index               wasm.Index
}
 213
// function is the instance-scoped view of a compiledFunction: it pairs the
// shared compiled code (parent) with the module instance it runs in, along
// with the function's type and the type ID used to check indirect calls.
type function struct {
	funcType       *wasm.FunctionType
	moduleInstance *wasm.ModuleInstance
	typeID         wasm.FunctionTypeID
	parent         *compiledFunction
}
 220
 221// functionFromUintptr resurrects the original *function from the given uintptr
 222// which comes from either funcref table or OpcodeRefFunc instruction.
 223func functionFromUintptr(ptr uintptr) *function {
 224	// Wraps ptrs as the double pointer in order to avoid the unsafe access as detected by race detector.
 225	//
 226	// For example, if we have (*function)(unsafe.Pointer(ptr)) instead, then the race detector's "checkptr"
 227	// subroutine wanrs as "checkptr: pointer arithmetic result points to invalid allocation"
 228	// https://github.com/golang/go/blob/1ce7fcf139417d618c2730010ede2afb41664211/src/runtime/checkptr.go#L69
 229	var wrapped *uintptr = &ptr
 230	return *(**function)(unsafe.Pointer(wrapped))
 231}
 232
// snapshot is a saved copy of a callEngine's operand stack and call frame
// stack, created by callEngine.Snapshot and re-installed by doRestore.
type snapshot struct {
	// stack and frames are the copies taken when the snapshot was created.
	stack  []uint64
	frames []*callFrame
	// pc is written into the top frame by doRestore.
	// NOTE(review): no code visible in this part of the file assigns pc, so
	// it appears to stay zero — confirm.
	pc uint64

	// ret holds the values passed to Restore; doRestore copies them over the
	// top of the restored stack.
	ret []uint64

	// ce is the call engine the snapshot was taken from.
	ce *callEngine
}
 242
 243// Snapshot implements the same method as documented on experimental.Snapshotter.
 244func (ce *callEngine) Snapshot() experimental.Snapshot {
 245	stack := make([]uint64, len(ce.stack))
 246	copy(stack, ce.stack)
 247
 248	frames := make([]*callFrame, len(ce.frames))
 249	copy(frames, ce.frames)
 250
 251	return &snapshot{
 252		stack:  stack,
 253		frames: frames,
 254		ce:     ce,
 255	}
 256}
 257
// Restore implements the same method as documented on experimental.Snapshot.
//
// It records ret on the snapshot and then panics with the snapshot itself;
// the actual state restoration happens in doRestore, invoked by whichever
// deferred recover handles the panic.
func (s *snapshot) Restore(ret []uint64) {
	s.ret = ret
	panic(s)
}
 263
// doRestore re-installs the snapshotted state on the call engine the
// snapshot was taken from.
func (s *snapshot) doRestore() {
	ce := s.ce

	// The snapshot's own slices become the live stack and frames; no further
	// copy is made here.
	ce.stack = s.stack
	ce.frames = s.frames
	// NOTE(review): s.pc is not assigned anywhere in the visible code, so this
	// looks like it resets the top frame's pc to zero — confirm.
	ce.frames[len(ce.frames)-1].pc = s.pc

	// Overwrite the top len(s.ret) stack slots with the values given to Restore.
	copy(ce.stack[len(ce.stack)-len(s.ret):], s.ret)
}
 273
 274// Error implements the same method on error.
 275func (s *snapshot) Error() string {
 276	return "unhandled snapshot restore, this generally indicates restore was called from a different " +
 277		"exported function invocation than snapshot"
 278}
 279
// stackIterator implements experimental.StackIterator.
//
// It starts positioned on fn (the function given to reset) and each
// subsequent Next moves to the topmost remaining frame.
type stackIterator struct {
	// stack is the operand stack as seen from the current position.
	stack []uint64
	// frames are the call frames not yet visited; the last one is visited next.
	frames []*callFrame
	// started is false until the first call to Next.
	started bool
	// fn and pc are the function and program counter at the current position.
	fn *function
	pc uint64
}
 288
 289func (si *stackIterator) reset(stack []uint64, frames []*callFrame, f *function) {
 290	si.fn = f
 291	si.pc = 0
 292	si.stack = stack
 293	si.frames = frames
 294	si.started = false
 295}
 296
 297func (si *stackIterator) clear() {
 298	si.stack = nil
 299	si.frames = nil
 300	si.started = false
 301	si.fn = nil
 302}
 303
 304// Next implements the same method as documented on experimental.StackIterator.
 305func (si *stackIterator) Next() bool {
 306	if !si.started {
 307		si.started = true
 308		return true
 309	}
 310
 311	if len(si.frames) == 0 {
 312		return false
 313	}
 314
 315	frame := si.frames[len(si.frames)-1]
 316	si.stack = si.stack[:frame.base]
 317	si.fn = frame.f
 318	si.pc = frame.pc
 319	si.frames = si.frames[:len(si.frames)-1]
 320	return true
 321}
 322
 323// Function implements the same method as documented on
 324// experimental.StackIterator.
 325func (si *stackIterator) Function() experimental.InternalFunction {
 326	return internalFunction{si.fn}
 327}
 328
// ProgramCounter implements the same method as documented on
// experimental.StackIterator. It returns the program counter recorded for
// the iterator's current position.
func (si *stackIterator) ProgramCounter() experimental.ProgramCounter {
	return experimental.ProgramCounter(si.pc)
}
 334
// internalFunction implements experimental.InternalFunction by wrapping a
// *function.
type internalFunction struct{ *function }
 337
// Definition implements the same method as documented on
// experimental.InternalFunction. It delegates to the wrapped function's
// definition method.
func (f internalFunction) Definition() api.FunctionDefinition {
	return f.definition()
}
 343
 344// SourceOffsetForPC implements the same method as documented on
 345// experimental.InternalFunction.
 346func (f internalFunction) SourceOffsetForPC(pc experimental.ProgramCounter) uint64 {
 347	offsetsMap := f.parent.offsetsInWasmBinary
 348	if uint64(pc) < uint64(len(offsetsMap)) {
 349		return offsetsMap[pc]
 350	}
 351	return 0
 352}
 353
// callFrameStackSize is the call frame size passed to the IR compiler.
// Interpreter mode doesn't maintain call frames in the stack, so pass the zero size to the IR.
const callFrameStackSize = 0
 356
 357// CompileModule implements the same method as documented on wasm.Engine.
 358func (e *engine) CompileModule(_ context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) error {
 359	if _, ok := e.getCompiledFunctions(module); ok { // cache hit!
 360		return nil
 361	}
 362
 363	funcs := make([]compiledFunction, len(module.FunctionSection))
 364	irCompiler, err := newCompiler(e.enabledFeatures, callFrameStackSize, module, ensureTermination)
 365	if err != nil {
 366		return err
 367	}
 368	imported := module.ImportFunctionCount
 369	for i := range module.CodeSection {
 370		var lsn experimental.FunctionListener
 371		if i < len(listeners) {
 372			lsn = listeners[i]
 373		}
 374
 375		compiled := &funcs[i]
 376		// If this is the host function, there's nothing to do as the runtime representation of
 377		// host function in interpreter is its Go function itself as opposed to Wasm functions,
 378		// which need to be compiled down to
 379		if codeSeg := &module.CodeSection[i]; codeSeg.GoFunc != nil {
 380			compiled.hostFn = codeSeg.GoFunc
 381		} else {
 382			ir, err := irCompiler.Next()
 383			if err != nil {
 384				return err
 385			}
 386			err = e.lowerIR(ir, compiled)
 387			if err != nil {
 388				def := module.FunctionDefinition(uint32(i) + module.ImportFunctionCount)
 389				return fmt.Errorf("failed to lower func[%s] to interpreterir: %w", def.DebugName(), err)
 390			}
 391		}
 392		compiled.source = module
 393		compiled.ensureTermination = ensureTermination
 394		compiled.listener = lsn
 395		compiled.index = imported + uint32(i)
 396	}
 397	e.addCompiledFunctions(module, funcs)
 398	return nil
 399}
 400
 401// NewModuleEngine implements the same method as documented on wasm.Engine.
 402func (e *engine) NewModuleEngine(module *wasm.Module, instance *wasm.ModuleInstance) (wasm.ModuleEngine, error) {
 403	me := &moduleEngine{
 404		parentEngine: e,
 405		functions:    make([]function, len(module.FunctionSection)+int(module.ImportFunctionCount)),
 406	}
 407
 408	codes, ok := e.getCompiledFunctions(module)
 409	if !ok {
 410		return nil, errors.New("source module must be compiled before instantiation")
 411	}
 412
 413	for i := range codes {
 414		c := &codes[i]
 415		offset := i + int(module.ImportFunctionCount)
 416		typeIndex := module.FunctionSection[i]
 417		me.functions[offset] = function{
 418			moduleInstance: instance,
 419			typeID:         instance.TypeIDs[typeIndex],
 420			funcType:       &module.TypeSection[typeIndex],
 421			parent:         c,
 422		}
 423	}
 424	return me, nil
 425}
 426
 427// lowerIR lowers the interpreterir operations to engine friendly struct.
 428func (e *engine) lowerIR(ir *compilationResult, ret *compiledFunction) error {
 429	// Copy the body from the result.
 430	ret.body = make([]unionOperation, len(ir.Operations))
 431	copy(ret.body, ir.Operations)
 432	// Also copy the offsets if necessary.
 433	if offsets := ir.IROperationSourceOffsetsInWasmBinary; len(offsets) > 0 {
 434		ret.offsetsInWasmBinary = make([]uint64, len(offsets))
 435		copy(ret.offsetsInWasmBinary, offsets)
 436	}
 437
 438	labelAddressResolutions := [labelKindNum][]uint64{}
 439
 440	// First, we iterate all labels, and resolve the address.
 441	for i := range ret.body {
 442		op := &ret.body[i]
 443		switch op.Kind {
 444		case operationKindLabel:
 445			label := label(op.U1)
 446			address := uint64(i)
 447
 448			kind, fid := label.Kind(), label.FrameID()
 449			frameToAddresses := labelAddressResolutions[label.Kind()]
 450			// Expand the slice if necessary.
 451			if diff := fid - len(frameToAddresses) + 1; diff > 0 {
 452				for j := 0; j < diff; j++ {
 453					frameToAddresses = append(frameToAddresses, 0)
 454				}
 455			}
 456			frameToAddresses[fid] = address
 457			labelAddressResolutions[kind] = frameToAddresses
 458		}
 459	}
 460
 461	// Then resolve the label as the index to the body.
 462	for i := range ret.body {
 463		op := &ret.body[i]
 464		switch op.Kind {
 465		case operationKindBr:
 466			e.setLabelAddress(&op.U1, label(op.U1), labelAddressResolutions)
 467		case operationKindBrIf:
 468			e.setLabelAddress(&op.U1, label(op.U1), labelAddressResolutions)
 469			e.setLabelAddress(&op.U2, label(op.U2), labelAddressResolutions)
 470		case operationKindBrTable:
 471			for j := 0; j < len(op.Us); j += 2 {
 472				target := op.Us[j]
 473				e.setLabelAddress(&op.Us[j], label(target), labelAddressResolutions)
 474			}
 475		}
 476	}
 477	return nil
 478}
 479
 480func (e *engine) setLabelAddress(op *uint64, label label, labelAddressResolutions [labelKindNum][]uint64) {
 481	if label.IsReturnTarget() {
 482		// Jmp to the end of the possible binary.
 483		*op = math.MaxUint64
 484	} else {
 485		*op = labelAddressResolutions[label.Kind()][label.FrameID()]
 486	}
 487}
 488
 489// ResolveImportedFunction implements wasm.ModuleEngine.
 490func (e *moduleEngine) ResolveImportedFunction(index, descFunc, indexInImportedModule wasm.Index, importedModuleEngine wasm.ModuleEngine) {
 491	imported := importedModuleEngine.(*moduleEngine)
 492	e.functions[index] = imported.functions[indexInImportedModule]
 493}
 494
// ResolveImportedMemory implements wasm.ModuleEngine. It is a no-op in the interpreter.
func (e *moduleEngine) ResolveImportedMemory(wasm.ModuleEngine) {}
 497
// DoneInstantiation implements wasm.ModuleEngine. It is a no-op in the interpreter.
func (e *moduleEngine) DoneInstantiation() {}
 500
 501// FunctionInstanceReference implements the same method as documented on wasm.ModuleEngine.
 502func (e *moduleEngine) FunctionInstanceReference(funcIndex wasm.Index) wasm.Reference {
 503	return uintptr(unsafe.Pointer(&e.functions[funcIndex]))
 504}
 505
 506// NewFunction implements the same method as documented on wasm.ModuleEngine.
 507func (e *moduleEngine) NewFunction(index wasm.Index) (ce api.Function) {
 508	// Note: The input parameters are pre-validated, so a compiled function is only absent on close. Updates to
 509	// code on close aren't locked, neither is this read.
 510	compiled := &e.functions[index]
 511	return e.newCallEngine(compiled)
 512}
 513
 514// LookupFunction implements the same method as documented on wasm.ModuleEngine.
 515func (e *moduleEngine) LookupFunction(t *wasm.TableInstance, typeId wasm.FunctionTypeID, tableOffset wasm.Index) (*wasm.ModuleInstance, wasm.Index) {
 516	if tableOffset >= uint32(len(t.References)) {
 517		panic(wasmruntime.ErrRuntimeInvalidTableAccess)
 518	}
 519	rawPtr := t.References[tableOffset]
 520	if rawPtr == 0 {
 521		panic(wasmruntime.ErrRuntimeInvalidTableAccess)
 522	}
 523
 524	tf := functionFromUintptr(rawPtr)
 525	if tf.typeID != typeId {
 526		panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch)
 527	}
 528	return tf.moduleInstance, tf.parent.index
 529}
 530
// Definition implements the same method as documented on api.Function.
// It returns the definition of the call engine's initial function.
func (ce *callEngine) Definition() api.FunctionDefinition {
	return ce.f.definition()
}
 535
 536func (f *function) definition() api.FunctionDefinition {
 537	compiled := f.parent
 538	return compiled.source.FunctionDefinition(compiled.index)
 539}
 540
 541// Call implements the same method as documented on api.Function.
 542func (ce *callEngine) Call(ctx context.Context, params ...uint64) (results []uint64, err error) {
 543	ft := ce.f.funcType
 544	if n := ft.ParamNumInUint64; n != len(params) {
 545		return nil, fmt.Errorf("expected %d params, but passed %d", n, len(params))
 546	}
 547	return ce.call(ctx, params, nil)
 548}
 549
 550// CallWithStack implements the same method as documented on api.Function.
 551func (ce *callEngine) CallWithStack(ctx context.Context, stack []uint64) error {
 552	params, results, err := wasm.SplitCallStack(ce.f.funcType, stack)
 553	if err != nil {
 554		return err
 555	}
 556	_, err = ce.call(ctx, params, results)
 557	return err
 558}
 559
// call runs the call engine's initial function with params and returns its
// results.
//
// When results is nil and the function has results, a new slice is allocated
// for them; otherwise the results are copied into the given slice. Panics
// raised while the function runs are recovered and converted into err by
// recoverOnCall. If no other error occurred but the module was closed during
// the call, the module's close error is returned instead.
func (ce *callEngine) call(ctx context.Context, params, results []uint64) (_ []uint64, err error) {
	m := ce.f.moduleInstance
	if ce.f.parent.ensureTermination {
		select {
		case <-ctx.Done():
			// If the provided context is already done, close the call context
			// and return the error.
			m.CloseWithCtxErr(ctx)
			return nil, m.FailIfClosed()
		default:
		}
	}

	// When snapshotting is enabled on the context, expose this call engine
	// under SnapshotterKey for the duration of the call.
	if ctx.Value(expctxkeys.EnableSnapshotterKey{}) != nil {
		ctx = context.WithValue(ctx, expctxkeys.SnapshotterKey{}, ce)
	}

	defer func() {
		// If the module closed during the call, and the call didn't err for another reason, set an ExitError.
		if err == nil {
			err = m.FailIfClosed()
		}
		// TODO: ^^ Will not fail if the function was imported from a closed module.

		// A recovered panic takes precedence: it overwrites whatever err was set above.
		if v := recover(); v != nil {
			err = ce.recoverOnCall(ctx, m, v)
		}
	}()

	ce.pushValues(params)

	if ce.f.parent.ensureTermination {
		done := m.CloseModuleOnCanceledOrTimeout(ctx)
		// Deferred after the recover handler above, so done runs before it.
		defer done()
	}

	ce.callFunction(ctx, m, ce.f)

	// This returns a safe copy of the results, instead of a slice view. If we
	// returned a re-slice, the caller could accidentally or purposefully
	// corrupt the stack of subsequent calls.
	ft := ce.f.funcType
	if results == nil && ft.ResultNumInUint64 > 0 {
		results = make([]uint64, ft.ResultNumInUint64)
	}
	ce.popValues(results)
	return results, nil
}
 608
// functionListenerInvocation captures arguments needed to perform function
// listener invocations when unwinding the call stack: the listener itself
// and the definition of the function it is attached to.
type functionListenerInvocation struct {
	experimental.FunctionListener
	def api.FunctionDefinition
}
 615
 616// recoverOnCall takes the recovered value `recoverOnCall`, and wraps it
 617// with the call frame stack traces. Also, reset the state of callEngine
 618// so that it can be used for the subsequent calls.
 619func (ce *callEngine) recoverOnCall(ctx context.Context, m *wasm.ModuleInstance, v interface{}) (err error) {
 620	if s, ok := v.(*snapshot); ok {
 621		// A snapshot that wasn't handled was created by a different call engine possibly from a nested wasm invocation,
 622		// let it propagate up to be handled by the caller.
 623		panic(s)
 624	}
 625
 626	builder := wasmdebug.NewErrorBuilder()
 627	frameCount := len(ce.frames)
 628	functionListeners := make([]functionListenerInvocation, 0, 16)
 629
 630	if frameCount > wasmdebug.MaxFrames {
 631		frameCount = wasmdebug.MaxFrames
 632	}
 633	for i := 0; i < frameCount; i++ {
 634		frame := ce.popFrame()
 635		f := frame.f
 636		def := f.definition()
 637		var sources []string
 638		if parent := frame.f.parent; parent.body != nil && len(parent.offsetsInWasmBinary) > 0 {
 639			sources = parent.source.DWARFLines.Line(parent.offsetsInWasmBinary[frame.pc])
 640		}
 641		builder.AddFrame(def.DebugName(), def.ParamTypes(), def.ResultTypes(), sources)
 642		if f.parent.listener != nil {
 643			functionListeners = append(functionListeners, functionListenerInvocation{
 644				FunctionListener: f.parent.listener,
 645				def:              f.definition(),
 646			})
 647		}
 648	}
 649
 650	err = builder.FromRecovered(v)
 651	for i := range functionListeners {
 652		functionListeners[i].Abort(ctx, m, functionListeners[i].def, err)
 653	}
 654
 655	// Allows the reuse of CallEngine.
 656	ce.stack, ce.frames = ce.stack[:0], ce.frames[:0]
 657	return
 658}
 659
 660func (ce *callEngine) callFunction(ctx context.Context, m *wasm.ModuleInstance, f *function) {
 661	if f.parent.hostFn != nil {
 662		ce.callGoFuncWithStack(ctx, m, f)
 663	} else if lsn := f.parent.listener; lsn != nil {
 664		ce.callNativeFuncWithListener(ctx, m, f, lsn)
 665	} else {
 666		ce.callNativeFunc(ctx, m, f)
 667	}
 668}
 669
 670func (ce *callEngine) callGoFunc(ctx context.Context, m *wasm.ModuleInstance, f *function, stack []uint64) {
 671	typ := f.funcType
 672	lsn := f.parent.listener
 673	if lsn != nil {
 674		params := stack[:typ.ParamNumInUint64]
 675		ce.stackIterator.reset(ce.stack, ce.frames, f)
 676		lsn.Before(ctx, m, f.definition(), params, &ce.stackIterator)
 677		ce.stackIterator.clear()
 678	}
 679	frame := &callFrame{f: f, base: len(ce.stack)}
 680	ce.pushFrame(frame)
 681
 682	fn := f.parent.hostFn
 683	switch fn := fn.(type) {
 684	case api.GoModuleFunction:
 685		fn.Call(ctx, m, stack)
 686	case api.GoFunction:
 687		fn.Call(ctx, stack)
 688	}
 689
 690	ce.popFrame()
 691	if lsn != nil {
 692		// TODO: This doesn't get the error due to use of panic to propagate them.
 693		results := stack[:typ.ResultNumInUint64]
 694		lsn.After(ctx, m, f.definition(), results)
 695	}
 696}
 697
 698func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance, f *function) {
 699	frame := &callFrame{f: f, base: len(ce.stack)}
 700	moduleInst := f.moduleInstance
 701	functions := moduleInst.Engine.(*moduleEngine).functions
 702	memoryInst := moduleInst.MemoryInstance
 703	globals := moduleInst.Globals
 704	tables := moduleInst.Tables
 705	typeIDs := moduleInst.TypeIDs
 706	dataInstances := moduleInst.DataInstances
 707	elementInstances := moduleInst.ElementInstances
 708	ce.pushFrame(frame)
 709	body := frame.f.parent.body
 710	bodyLen := uint64(len(body))
 711	for frame.pc < bodyLen {
 712		op := &body[frame.pc]
 713		// TODO: add description of each operation/case
 714		// on, for example, how many args are used,
 715		// how the stack is modified, etc.
 716		switch op.Kind {
 717		case operationKindBuiltinFunctionCheckExitCode:
 718			if err := m.FailIfClosed(); err != nil {
 719				panic(err)
 720			}
 721			frame.pc++
 722		case operationKindUnreachable:
 723			panic(wasmruntime.ErrRuntimeUnreachable)
 724		case operationKindBr:
 725			frame.pc = op.U1
 726		case operationKindBrIf:
 727			if ce.popValue() > 0 {
 728				ce.drop(op.U3)
 729				frame.pc = op.U1
 730			} else {
 731				frame.pc = op.U2
 732			}
 733		case operationKindBrTable:
 734			v := ce.popValue()
 735			defaultAt := uint64(len(op.Us))/2 - 1
 736			if v > defaultAt {
 737				v = defaultAt
 738			}
 739			v *= 2
 740			ce.drop(op.Us[v+1])
 741			frame.pc = op.Us[v]
 742		case operationKindCall:
 743			func() {
 744				if ctx.Value(expctxkeys.EnableSnapshotterKey{}) != nil {
 745					defer func() {
 746						if r := recover(); r != nil {
 747							if s, ok := r.(*snapshot); ok && s.ce == ce {
 748								s.doRestore()
 749								frame = ce.frames[len(ce.frames)-1]
 750								body = frame.f.parent.body
 751								bodyLen = uint64(len(body))
 752							} else {
 753								panic(r)
 754							}
 755						}
 756					}()
 757				}
 758				ce.callFunction(ctx, f.moduleInstance, &functions[op.U1])
 759			}()
 760			frame.pc++
 761		case operationKindCallIndirect:
 762			offset := ce.popValue()
 763			table := tables[op.U2]
 764			if offset >= uint64(len(table.References)) {
 765				panic(wasmruntime.ErrRuntimeInvalidTableAccess)
 766			}
 767			rawPtr := table.References[offset]
 768			if rawPtr == 0 {
 769				panic(wasmruntime.ErrRuntimeInvalidTableAccess)
 770			}
 771
 772			tf := functionFromUintptr(rawPtr)
 773			if tf.typeID != typeIDs[op.U1] {
 774				panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch)
 775			}
 776
 777			ce.callFunction(ctx, f.moduleInstance, tf)
 778			frame.pc++
 779		case operationKindDrop:
 780			ce.drop(op.U1)
 781			frame.pc++
 782		case operationKindSelect:
 783			c := ce.popValue()
 784			if op.B3 { // Target is vector.
 785				x2Hi, x2Lo := ce.popValue(), ce.popValue()
 786				if c == 0 {
 787					_, _ = ce.popValue(), ce.popValue() // discard the x1's lo and hi bits.
 788					ce.pushValue(x2Lo)
 789					ce.pushValue(x2Hi)
 790				}
 791			} else {
 792				v2 := ce.popValue()
 793				if c == 0 {
 794					_ = ce.popValue()
 795					ce.pushValue(v2)
 796				}
 797			}
 798			frame.pc++
 799		case operationKindPick:
 800			index := len(ce.stack) - 1 - int(op.U1)
 801			ce.pushValue(ce.stack[index])
 802			if op.B3 { // V128 value target.
 803				ce.pushValue(ce.stack[index+1])
 804			}
 805			frame.pc++
 806		case operationKindSet:
 807			if op.B3 { // V128 value target.
 808				lowIndex := len(ce.stack) - 1 - int(op.U1)
 809				highIndex := lowIndex + 1
 810				hi, lo := ce.popValue(), ce.popValue()
 811				ce.stack[lowIndex], ce.stack[highIndex] = lo, hi
 812			} else {
 813				index := len(ce.stack) - 1 - int(op.U1)
 814				ce.stack[index] = ce.popValue()
 815			}
 816			frame.pc++
 817		case operationKindGlobalGet:
 818			g := globals[op.U1]
 819			ce.pushValue(g.Val)
 820			if g.Type.ValType == wasm.ValueTypeV128 {
 821				ce.pushValue(g.ValHi)
 822			}
 823			frame.pc++
 824		case operationKindGlobalSet:
 825			g := globals[op.U1]
 826			if g.Type.ValType == wasm.ValueTypeV128 {
 827				g.ValHi = ce.popValue()
 828			}
 829			g.Val = ce.popValue()
 830			frame.pc++
 831		case operationKindLoad:
 832			offset := ce.popMemoryOffset(op)
 833			switch unsignedType(op.B1) {
 834			case unsignedTypeI32, unsignedTypeF32:
 835				if val, ok := memoryInst.ReadUint32Le(offset); !ok {
 836					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 837				} else {
 838					ce.pushValue(uint64(val))
 839				}
 840			case unsignedTypeI64, unsignedTypeF64:
 841				if val, ok := memoryInst.ReadUint64Le(offset); !ok {
 842					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 843				} else {
 844					ce.pushValue(val)
 845				}
 846			}
 847			frame.pc++
 848		case operationKindLoad8:
 849			val, ok := memoryInst.ReadByte(ce.popMemoryOffset(op))
 850			if !ok {
 851				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 852			}
 853
 854			switch signedInt(op.B1) {
 855			case signedInt32:
 856				ce.pushValue(uint64(uint32(int8(val))))
 857			case signedInt64:
 858				ce.pushValue(uint64(int8(val)))
 859			case signedUint32, signedUint64:
 860				ce.pushValue(uint64(val))
 861			}
 862			frame.pc++
 863		case operationKindLoad16:
 864
 865			val, ok := memoryInst.ReadUint16Le(ce.popMemoryOffset(op))
 866			if !ok {
 867				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 868			}
 869
 870			switch signedInt(op.B1) {
 871			case signedInt32:
 872				ce.pushValue(uint64(uint32(int16(val))))
 873			case signedInt64:
 874				ce.pushValue(uint64(int16(val)))
 875			case signedUint32, signedUint64:
 876				ce.pushValue(uint64(val))
 877			}
 878			frame.pc++
 879		case operationKindLoad32:
 880			val, ok := memoryInst.ReadUint32Le(ce.popMemoryOffset(op))
 881			if !ok {
 882				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 883			}
 884
 885			if op.B1 == 1 { // Signed
 886				ce.pushValue(uint64(int32(val)))
 887			} else {
 888				ce.pushValue(uint64(val))
 889			}
 890			frame.pc++
 891		case operationKindStore:
 892			val := ce.popValue()
 893			offset := ce.popMemoryOffset(op)
 894			switch unsignedType(op.B1) {
 895			case unsignedTypeI32, unsignedTypeF32:
 896				if !memoryInst.WriteUint32Le(offset, uint32(val)) {
 897					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 898				}
 899			case unsignedTypeI64, unsignedTypeF64:
 900				if !memoryInst.WriteUint64Le(offset, val) {
 901					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 902				}
 903			}
 904			frame.pc++
 905		case operationKindStore8:
 906			val := byte(ce.popValue())
 907			offset := ce.popMemoryOffset(op)
 908			if !memoryInst.WriteByte(offset, val) {
 909				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 910			}
 911			frame.pc++
 912		case operationKindStore16:
 913			val := uint16(ce.popValue())
 914			offset := ce.popMemoryOffset(op)
 915			if !memoryInst.WriteUint16Le(offset, val) {
 916				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 917			}
 918			frame.pc++
 919		case operationKindStore32:
 920			val := uint32(ce.popValue())
 921			offset := ce.popMemoryOffset(op)
 922			if !memoryInst.WriteUint32Le(offset, val) {
 923				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
 924			}
 925			frame.pc++
 926		case operationKindMemorySize:
 927			ce.pushValue(uint64(memoryInst.Pages()))
 928			frame.pc++
 929		case operationKindMemoryGrow:
 930			n := ce.popValue()
 931			if res, ok := memoryInst.Grow(uint32(n)); !ok {
 932				ce.pushValue(uint64(0xffffffff)) // = -1 in signed 32-bit integer.
 933			} else {
 934				ce.pushValue(uint64(res))
 935			}
 936			frame.pc++
 937		case operationKindConstI32, operationKindConstI64,
 938			operationKindConstF32, operationKindConstF64:
 939			ce.pushValue(op.U1)
 940			frame.pc++
 941		case operationKindEq:
 942			var b bool
 943			switch unsignedType(op.B1) {
 944			case unsignedTypeI32:
 945				v2, v1 := ce.popValue(), ce.popValue()
 946				b = uint32(v1) == uint32(v2)
 947			case unsignedTypeI64:
 948				v2, v1 := ce.popValue(), ce.popValue()
 949				b = v1 == v2
 950			case unsignedTypeF32:
 951				v2, v1 := ce.popValue(), ce.popValue()
 952				b = math.Float32frombits(uint32(v2)) == math.Float32frombits(uint32(v1))
 953			case unsignedTypeF64:
 954				v2, v1 := ce.popValue(), ce.popValue()
 955				b = math.Float64frombits(v2) == math.Float64frombits(v1)
 956			}
 957			if b {
 958				ce.pushValue(1)
 959			} else {
 960				ce.pushValue(0)
 961			}
 962			frame.pc++
 963		case operationKindNe:
 964			var b bool
 965			switch unsignedType(op.B1) {
 966			case unsignedTypeI32, unsignedTypeI64:
 967				v2, v1 := ce.popValue(), ce.popValue()
 968				b = v1 != v2
 969			case unsignedTypeF32:
 970				v2, v1 := ce.popValue(), ce.popValue()
 971				b = math.Float32frombits(uint32(v2)) != math.Float32frombits(uint32(v1))
 972			case unsignedTypeF64:
 973				v2, v1 := ce.popValue(), ce.popValue()
 974				b = math.Float64frombits(v2) != math.Float64frombits(v1)
 975			}
 976			if b {
 977				ce.pushValue(1)
 978			} else {
 979				ce.pushValue(0)
 980			}
 981			frame.pc++
 982		case operationKindEqz:
 983			if ce.popValue() == 0 {
 984				ce.pushValue(1)
 985			} else {
 986				ce.pushValue(0)
 987			}
 988			frame.pc++
 989		case operationKindLt:
 990			v2 := ce.popValue()
 991			v1 := ce.popValue()
 992			var b bool
 993			switch signedType(op.B1) {
 994			case signedTypeInt32:
 995				b = int32(v1) < int32(v2)
 996			case signedTypeInt64:
 997				b = int64(v1) < int64(v2)
 998			case signedTypeUint32, signedTypeUint64:
 999				b = v1 < v2
1000			case signedTypeFloat32:
1001				b = math.Float32frombits(uint32(v1)) < math.Float32frombits(uint32(v2))
1002			case signedTypeFloat64:
1003				b = math.Float64frombits(v1) < math.Float64frombits(v2)
1004			}
1005			if b {
1006				ce.pushValue(1)
1007			} else {
1008				ce.pushValue(0)
1009			}
1010			frame.pc++
1011		case operationKindGt:
1012			v2 := ce.popValue()
1013			v1 := ce.popValue()
1014			var b bool
1015			switch signedType(op.B1) {
1016			case signedTypeInt32:
1017				b = int32(v1) > int32(v2)
1018			case signedTypeInt64:
1019				b = int64(v1) > int64(v2)
1020			case signedTypeUint32, signedTypeUint64:
1021				b = v1 > v2
1022			case signedTypeFloat32:
1023				b = math.Float32frombits(uint32(v1)) > math.Float32frombits(uint32(v2))
1024			case signedTypeFloat64:
1025				b = math.Float64frombits(v1) > math.Float64frombits(v2)
1026			}
1027			if b {
1028				ce.pushValue(1)
1029			} else {
1030				ce.pushValue(0)
1031			}
1032			frame.pc++
1033		case operationKindLe:
1034			v2 := ce.popValue()
1035			v1 := ce.popValue()
1036			var b bool
1037			switch signedType(op.B1) {
1038			case signedTypeInt32:
1039				b = int32(v1) <= int32(v2)
1040			case signedTypeInt64:
1041				b = int64(v1) <= int64(v2)
1042			case signedTypeUint32, signedTypeUint64:
1043				b = v1 <= v2
1044			case signedTypeFloat32:
1045				b = math.Float32frombits(uint32(v1)) <= math.Float32frombits(uint32(v2))
1046			case signedTypeFloat64:
1047				b = math.Float64frombits(v1) <= math.Float64frombits(v2)
1048			}
1049			if b {
1050				ce.pushValue(1)
1051			} else {
1052				ce.pushValue(0)
1053			}
1054			frame.pc++
1055		case operationKindGe:
1056			v2 := ce.popValue()
1057			v1 := ce.popValue()
1058			var b bool
1059			switch signedType(op.B1) {
1060			case signedTypeInt32:
1061				b = int32(v1) >= int32(v2)
1062			case signedTypeInt64:
1063				b = int64(v1) >= int64(v2)
1064			case signedTypeUint32, signedTypeUint64:
1065				b = v1 >= v2
1066			case signedTypeFloat32:
1067				b = math.Float32frombits(uint32(v1)) >= math.Float32frombits(uint32(v2))
1068			case signedTypeFloat64:
1069				b = math.Float64frombits(v1) >= math.Float64frombits(v2)
1070			}
1071			if b {
1072				ce.pushValue(1)
1073			} else {
1074				ce.pushValue(0)
1075			}
1076			frame.pc++
1077		case operationKindAdd:
1078			v2 := ce.popValue()
1079			v1 := ce.popValue()
1080			switch unsignedType(op.B1) {
1081			case unsignedTypeI32:
1082				v := uint32(v1) + uint32(v2)
1083				ce.pushValue(uint64(v))
1084			case unsignedTypeI64:
1085				ce.pushValue(v1 + v2)
1086			case unsignedTypeF32:
1087				ce.pushValue(addFloat32bits(uint32(v1), uint32(v2)))
1088			case unsignedTypeF64:
1089				v := math.Float64frombits(v1) + math.Float64frombits(v2)
1090				ce.pushValue(math.Float64bits(v))
1091			}
1092			frame.pc++
1093		case operationKindSub:
1094			v2 := ce.popValue()
1095			v1 := ce.popValue()
1096			switch unsignedType(op.B1) {
1097			case unsignedTypeI32:
1098				ce.pushValue(uint64(uint32(v1) - uint32(v2)))
1099			case unsignedTypeI64:
1100				ce.pushValue(v1 - v2)
1101			case unsignedTypeF32:
1102				ce.pushValue(subFloat32bits(uint32(v1), uint32(v2)))
1103			case unsignedTypeF64:
1104				v := math.Float64frombits(v1) - math.Float64frombits(v2)
1105				ce.pushValue(math.Float64bits(v))
1106			}
1107			frame.pc++
1108		case operationKindMul:
1109			v2 := ce.popValue()
1110			v1 := ce.popValue()
1111			switch unsignedType(op.B1) {
1112			case unsignedTypeI32:
1113				ce.pushValue(uint64(uint32(v1) * uint32(v2)))
1114			case unsignedTypeI64:
1115				ce.pushValue(v1 * v2)
1116			case unsignedTypeF32:
1117				ce.pushValue(mulFloat32bits(uint32(v1), uint32(v2)))
1118			case unsignedTypeF64:
1119				v := math.Float64frombits(v2) * math.Float64frombits(v1)
1120				ce.pushValue(math.Float64bits(v))
1121			}
1122			frame.pc++
1123		case operationKindClz:
1124			v := ce.popValue()
1125			if op.B1 == 0 {
1126				// unsignedInt32
1127				ce.pushValue(uint64(bits.LeadingZeros32(uint32(v))))
1128			} else {
1129				// unsignedInt64
1130				ce.pushValue(uint64(bits.LeadingZeros64(v)))
1131			}
1132			frame.pc++
1133		case operationKindCtz:
1134			v := ce.popValue()
1135			if op.B1 == 0 {
1136				// unsignedInt32
1137				ce.pushValue(uint64(bits.TrailingZeros32(uint32(v))))
1138			} else {
1139				// unsignedInt64
1140				ce.pushValue(uint64(bits.TrailingZeros64(v)))
1141			}
1142			frame.pc++
1143		case operationKindPopcnt:
1144			v := ce.popValue()
1145			if op.B1 == 0 {
1146				// unsignedInt32
1147				ce.pushValue(uint64(bits.OnesCount32(uint32(v))))
1148			} else {
1149				// unsignedInt64
1150				ce.pushValue(uint64(bits.OnesCount64(v)))
1151			}
1152			frame.pc++
1153		case operationKindDiv:
1154			// If an integer, check we won't divide by zero.
1155			t := signedType(op.B1)
1156			v2, v1 := ce.popValue(), ce.popValue()
1157			switch t {
1158			case signedTypeFloat32, signedTypeFloat64: // not integers
1159			default:
1160				if v2 == 0 {
1161					panic(wasmruntime.ErrRuntimeIntegerDivideByZero)
1162				}
1163			}
1164
1165			switch t {
1166			case signedTypeInt32:
1167				d := int32(v2)
1168				n := int32(v1)
1169				if n == math.MinInt32 && d == -1 {
1170					panic(wasmruntime.ErrRuntimeIntegerOverflow)
1171				}
1172				ce.pushValue(uint64(uint32(n / d)))
1173			case signedTypeInt64:
1174				d := int64(v2)
1175				n := int64(v1)
1176				if n == math.MinInt64 && d == -1 {
1177					panic(wasmruntime.ErrRuntimeIntegerOverflow)
1178				}
1179				ce.pushValue(uint64(n / d))
1180			case signedTypeUint32:
1181				d := uint32(v2)
1182				n := uint32(v1)
1183				ce.pushValue(uint64(n / d))
1184			case signedTypeUint64:
1185				d := v2
1186				n := v1
1187				ce.pushValue(n / d)
1188			case signedTypeFloat32:
1189				ce.pushValue(divFloat32bits(uint32(v1), uint32(v2)))
1190			case signedTypeFloat64:
1191				ce.pushValue(math.Float64bits(math.Float64frombits(v1) / math.Float64frombits(v2)))
1192			}
1193			frame.pc++
1194		case operationKindRem:
1195			v2, v1 := ce.popValue(), ce.popValue()
1196			if v2 == 0 {
1197				panic(wasmruntime.ErrRuntimeIntegerDivideByZero)
1198			}
1199			switch signedInt(op.B1) {
1200			case signedInt32:
1201				d := int32(v2)
1202				n := int32(v1)
1203				ce.pushValue(uint64(uint32(n % d)))
1204			case signedInt64:
1205				d := int64(v2)
1206				n := int64(v1)
1207				ce.pushValue(uint64(n % d))
1208			case signedUint32:
1209				d := uint32(v2)
1210				n := uint32(v1)
1211				ce.pushValue(uint64(n % d))
1212			case signedUint64:
1213				d := v2
1214				n := v1
1215				ce.pushValue(n % d)
1216			}
1217			frame.pc++
1218		case operationKindAnd:
1219			v2 := ce.popValue()
1220			v1 := ce.popValue()
1221			if op.B1 == 0 {
1222				// unsignedInt32
1223				ce.pushValue(uint64(uint32(v2) & uint32(v1)))
1224			} else {
1225				// unsignedInt64
1226				ce.pushValue(uint64(v2 & v1))
1227			}
1228			frame.pc++
1229		case operationKindOr:
1230			v2 := ce.popValue()
1231			v1 := ce.popValue()
1232			if op.B1 == 0 {
1233				// unsignedInt32
1234				ce.pushValue(uint64(uint32(v2) | uint32(v1)))
1235			} else {
1236				// unsignedInt64
1237				ce.pushValue(uint64(v2 | v1))
1238			}
1239			frame.pc++
1240		case operationKindXor:
1241			v2 := ce.popValue()
1242			v1 := ce.popValue()
1243			if op.B1 == 0 {
1244				// unsignedInt32
1245				ce.pushValue(uint64(uint32(v2) ^ uint32(v1)))
1246			} else {
1247				// unsignedInt64
1248				ce.pushValue(uint64(v2 ^ v1))
1249			}
1250			frame.pc++
1251		case operationKindShl:
1252			v2 := ce.popValue()
1253			v1 := ce.popValue()
1254			if op.B1 == 0 {
1255				// unsignedInt32
1256				ce.pushValue(uint64(uint32(v1) << (uint32(v2) % 32)))
1257			} else {
1258				// unsignedInt64
1259				ce.pushValue(v1 << (v2 % 64))
1260			}
1261			frame.pc++
1262		case operationKindShr:
1263			v2 := ce.popValue()
1264			v1 := ce.popValue()
1265			switch signedInt(op.B1) {
1266			case signedInt32:
1267				ce.pushValue(uint64(uint32(int32(v1) >> (uint32(v2) % 32))))
1268			case signedInt64:
1269				ce.pushValue(uint64(int64(v1) >> (v2 % 64)))
1270			case signedUint32:
1271				ce.pushValue(uint64(uint32(v1) >> (uint32(v2) % 32)))
1272			case signedUint64:
1273				ce.pushValue(v1 >> (v2 % 64))
1274			}
1275			frame.pc++
1276		case operationKindRotl:
1277			v2 := ce.popValue()
1278			v1 := ce.popValue()
1279			if op.B1 == 0 {
1280				// unsignedInt32
1281				ce.pushValue(uint64(bits.RotateLeft32(uint32(v1), int(v2))))
1282			} else {
1283				// unsignedInt64
1284				ce.pushValue(uint64(bits.RotateLeft64(v1, int(v2))))
1285			}
1286			frame.pc++
1287		case operationKindRotr:
1288			v2 := ce.popValue()
1289			v1 := ce.popValue()
1290			if op.B1 == 0 {
1291				// unsignedInt32
1292				ce.pushValue(uint64(bits.RotateLeft32(uint32(v1), -int(v2))))
1293			} else {
1294				// unsignedInt64
1295				ce.pushValue(uint64(bits.RotateLeft64(v1, -int(v2))))
1296			}
1297			frame.pc++
1298		case operationKindAbs:
1299			if op.B1 == 0 {
1300				// float32
1301				const mask uint32 = 1 << 31
1302				ce.pushValue(uint64(uint32(ce.popValue()) &^ mask))
1303			} else {
1304				// float64
1305				const mask uint64 = 1 << 63
1306				ce.pushValue(ce.popValue() &^ mask)
1307			}
1308			frame.pc++
1309		case operationKindNeg:
1310			if op.B1 == 0 {
1311				// float32
1312				v := -math.Float32frombits(uint32(ce.popValue()))
1313				ce.pushValue(uint64(math.Float32bits(v)))
1314			} else {
1315				// float64
1316				v := -math.Float64frombits(ce.popValue())
1317				ce.pushValue(math.Float64bits(v))
1318			}
1319			frame.pc++
1320		case operationKindCeil:
1321			if op.B1 == 0 {
1322				// float32
1323				v := moremath.WasmCompatCeilF32(math.Float32frombits(uint32(ce.popValue())))
1324				ce.pushValue(uint64(math.Float32bits(v)))
1325			} else {
1326				// float64
1327				v := moremath.WasmCompatCeilF64(math.Float64frombits(ce.popValue()))
1328				ce.pushValue(math.Float64bits(v))
1329			}
1330			frame.pc++
1331		case operationKindFloor:
1332			if op.B1 == 0 {
1333				// float32
1334				v := moremath.WasmCompatFloorF32(math.Float32frombits(uint32(ce.popValue())))
1335				ce.pushValue(uint64(math.Float32bits(v)))
1336			} else {
1337				// float64
1338				v := moremath.WasmCompatFloorF64(math.Float64frombits(ce.popValue()))
1339				ce.pushValue(math.Float64bits(v))
1340			}
1341			frame.pc++
1342		case operationKindTrunc:
1343			if op.B1 == 0 {
1344				// float32
1345				v := moremath.WasmCompatTruncF32(math.Float32frombits(uint32(ce.popValue())))
1346				ce.pushValue(uint64(math.Float32bits(v)))
1347			} else {
1348				// float64
1349				v := moremath.WasmCompatTruncF64(math.Float64frombits(ce.popValue()))
1350				ce.pushValue(math.Float64bits(v))
1351			}
1352			frame.pc++
1353		case operationKindNearest:
1354			if op.B1 == 0 {
1355				// float32
1356				f := math.Float32frombits(uint32(ce.popValue()))
1357				ce.pushValue(uint64(math.Float32bits(moremath.WasmCompatNearestF32(f))))
1358			} else {
1359				// float64
1360				f := math.Float64frombits(ce.popValue())
1361				ce.pushValue(math.Float64bits(moremath.WasmCompatNearestF64(f)))
1362			}
1363			frame.pc++
1364		case operationKindSqrt:
1365			if op.B1 == 0 {
1366				// float32
1367				v := math.Sqrt(float64(math.Float32frombits(uint32(ce.popValue()))))
1368				ce.pushValue(uint64(math.Float32bits(float32(v))))
1369			} else {
1370				// float64
1371				v := math.Sqrt(math.Float64frombits(ce.popValue()))
1372				ce.pushValue(math.Float64bits(v))
1373			}
1374			frame.pc++
1375		case operationKindMin:
1376			if op.B1 == 0 {
1377				// float32
1378				ce.pushValue(wasmCompatMin32bits(uint32(ce.popValue()), uint32(ce.popValue())))
1379			} else {
1380				v2 := math.Float64frombits(ce.popValue())
1381				v1 := math.Float64frombits(ce.popValue())
1382				ce.pushValue(math.Float64bits(moremath.WasmCompatMin64(v1, v2)))
1383			}
1384			frame.pc++
1385		case operationKindMax:
1386			if op.B1 == 0 {
1387				ce.pushValue(wasmCompatMax32bits(uint32(ce.popValue()), uint32(ce.popValue())))
1388			} else {
1389				// float64
1390				v2 := math.Float64frombits(ce.popValue())
1391				v1 := math.Float64frombits(ce.popValue())
1392				ce.pushValue(math.Float64bits(moremath.WasmCompatMax64(v1, v2)))
1393			}
1394			frame.pc++
1395		case operationKindCopysign:
1396			if op.B1 == 0 {
1397				// float32
1398				v2 := uint32(ce.popValue())
1399				v1 := uint32(ce.popValue())
1400				const signbit = 1 << 31
1401				ce.pushValue(uint64(v1&^signbit | v2&signbit))
1402			} else {
1403				// float64
1404				v2 := ce.popValue()
1405				v1 := ce.popValue()
1406				const signbit = 1 << 63
1407				ce.pushValue(v1&^signbit | v2&signbit)
1408			}
1409			frame.pc++
1410		case operationKindI32WrapFromI64:
1411			ce.pushValue(uint64(uint32(ce.popValue())))
1412			frame.pc++
1413		case operationKindITruncFromF:
1414			if op.B1 == 0 {
1415				// float32
1416				switch signedInt(op.B2) {
1417				case signedInt32:
1418					v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue()))))
1419					if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN
1420						if op.B3 {
1421							// non-trapping conversion must cast nan to zero.
1422							v = 0
1423						} else {
1424							panic(wasmruntime.ErrRuntimeInvalidConversionToInteger)
1425						}
1426					} else if v < math.MinInt32 || v > math.MaxInt32 {
1427						if op.B3 {
1428							// non-trapping conversion must "saturate" the value for overflowing sources.
1429							if v < 0 {
1430								v = math.MinInt32
1431							} else {
1432								v = math.MaxInt32
1433							}
1434						} else {
1435							panic(wasmruntime.ErrRuntimeIntegerOverflow)
1436						}
1437					}
1438					ce.pushValue(uint64(uint32(int32(v))))
1439				case signedInt64:
1440					v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue()))))
1441					res := int64(v)
1442					if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN
1443						if op.B3 {
1444							// non-trapping conversion must cast nan to zero.
1445							res = 0
1446						} else {
1447							panic(wasmruntime.ErrRuntimeInvalidConversionToInteger)
1448						}
1449					} else if v < math.MinInt64 || v >= math.MaxInt64 {
1450						// Note: math.MaxInt64 is rounded up to math.MaxInt64+1 in 64-bit float representation,
1451						// and that's why we use '>=' not '>' to check overflow.
1452						if op.B3 {
1453							// non-trapping conversion must "saturate" the value for overflowing sources.
1454							if v < 0 {
1455								res = math.MinInt64
1456							} else {
1457								res = math.MaxInt64
1458							}
1459						} else {
1460							panic(wasmruntime.ErrRuntimeIntegerOverflow)
1461						}
1462					}
1463					ce.pushValue(uint64(res))
1464				case signedUint32:
1465					v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue()))))
1466					if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN
1467						if op.B3 {
1468							// non-trapping conversion must cast nan to zero.
1469							v = 0
1470						} else {
1471							panic(wasmruntime.ErrRuntimeInvalidConversionToInteger)
1472						}
1473					} else if v < 0 || v > math.MaxUint32 {
1474						if op.B3 {
1475							// non-trapping conversion must "saturate" the value for overflowing source.
1476							if v < 0 {
1477								v = 0
1478							} else {
1479								v = math.MaxUint32
1480							}
1481						} else {
1482							panic(wasmruntime.ErrRuntimeIntegerOverflow)
1483						}
1484					}
1485					ce.pushValue(uint64(uint32(v)))
1486				case signedUint64:
1487					v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue()))))
1488					res := uint64(v)
1489					if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN
1490						if op.B3 {
1491							// non-trapping conversion must cast nan to zero.
1492							res = 0
1493						} else {
1494							panic(wasmruntime.ErrRuntimeInvalidConversionToInteger)
1495						}
1496					} else if v < 0 || v >= math.MaxUint64 {
1497						// Note: math.MaxUint64 is rounded up to math.MaxUint64+1 in 64-bit float representation,
1498						// and that's why we use '>=' not '>' to check overflow.
1499						if op.B3 {
1500							// non-trapping conversion must "saturate" the value for overflowing source.
1501							if v < 0 {
1502								res = 0
1503							} else {
1504								res = math.MaxUint64
1505							}
1506						} else {
1507							panic(wasmruntime.ErrRuntimeIntegerOverflow)
1508						}
1509					}
1510					ce.pushValue(res)
1511				}
1512			} else {
1513				// float64
1514				switch signedInt(op.B2) {
1515				case signedInt32:
1516					v := math.Trunc(math.Float64frombits(ce.popValue()))
1517					if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN
1518						if op.B3 {
1519							// non-trapping conversion must cast nan to zero.
1520							v = 0
1521						} else {
1522							panic(wasmruntime.ErrRuntimeInvalidConversionToInteger)
1523						}
1524					} else if v < math.MinInt32 || v > math.MaxInt32 {
1525						if op.B3 {
1526							// non-trapping conversion must "saturate" the value for overflowing source.
1527							if v < 0 {
1528								v = math.MinInt32
1529							} else {
1530								v = math.MaxInt32
1531							}
1532						} else {
1533							panic(wasmruntime.ErrRuntimeIntegerOverflow)
1534						}
1535					}
1536					ce.pushValue(uint64(uint32(int32(v))))
1537				case signedInt64:
1538					v := math.Trunc(math.Float64frombits(ce.popValue()))
1539					res := int64(v)
1540					if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN
1541						if op.B3 {
1542							// non-trapping conversion must cast nan to zero.
1543							res = 0
1544						} else {
1545							panic(wasmruntime.ErrRuntimeInvalidConversionToInteger)
1546						}
1547					} else if v < math.MinInt64 || v >= math.MaxInt64 {
1548						// Note: math.MaxInt64 is rounded up to math.MaxInt64+1 in 64-bit float representation,
1549						// and that's why we use '>=' not '>' to check overflow.
1550						if op.B3 {
1551							// non-trapping conversion must "saturate" the value for overflowing source.
1552							if v < 0 {
1553								res = math.MinInt64
1554							} else {
1555								res = math.MaxInt64
1556							}
1557						} else {
1558							panic(wasmruntime.ErrRuntimeIntegerOverflow)
1559						}
1560					}
1561					ce.pushValue(uint64(res))
1562				case signedUint32:
1563					v := math.Trunc(math.Float64frombits(ce.popValue()))
1564					if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN
1565						if op.B3 {
1566							// non-trapping conversion must cast nan to zero.
1567							v = 0
1568						} else {
1569							panic(wasmruntime.ErrRuntimeInvalidConversionToInteger)
1570						}
1571					} else if v < 0 || v > math.MaxUint32 {
1572						if op.B3 {
1573							// non-trapping conversion must "saturate" the value for overflowing source.
1574							if v < 0 {
1575								v = 0
1576							} else {
1577								v = math.MaxUint32
1578							}
1579						} else {
1580							panic(wasmruntime.ErrRuntimeIntegerOverflow)
1581						}
1582					}
1583					ce.pushValue(uint64(uint32(v)))
1584				case signedUint64:
1585					v := math.Trunc(math.Float64frombits(ce.popValue()))
1586					res := uint64(v)
1587					if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN
1588						if op.B3 {
1589							// non-trapping conversion must cast nan to zero.
1590							res = 0
1591						} else {
1592							panic(wasmruntime.ErrRuntimeInvalidConversionToInteger)
1593						}
1594					} else if v < 0 || v >= math.MaxUint64 {
1595						// Note: math.MaxUint64 is rounded up to math.MaxUint64+1 in 64-bit float representation,
1596						// and that's why we use '>=' not '>' to check overflow.
1597						if op.B3 {
1598							// non-trapping conversion must "saturate" the value for overflowing source.
1599							if v < 0 {
1600								res = 0
1601							} else {
1602								res = math.MaxUint64
1603							}
1604						} else {
1605							panic(wasmruntime.ErrRuntimeIntegerOverflow)
1606						}
1607					}
1608					ce.pushValue(res)
1609				}
1610			}
1611			frame.pc++
1612		case operationKindFConvertFromI:
1613			switch signedInt(op.B1) {
1614			case signedInt32:
1615				if op.B2 == 0 {
1616					// float32
1617					v := float32(int32(ce.popValue()))
1618					ce.pushValue(uint64(math.Float32bits(v)))
1619				} else {
1620					// float64
1621					v := float64(int32(ce.popValue()))
1622					ce.pushValue(math.Float64bits(v))
1623				}
1624			case signedInt64:
1625				if op.B2 == 0 {
1626					// float32
1627					v := float32(int64(ce.popValue()))
1628					ce.pushValue(uint64(math.Float32bits(v)))
1629				} else {
1630					// float64
1631					v := float64(int64(ce.popValue()))
1632					ce.pushValue(math.Float64bits(v))
1633				}
1634			case signedUint32:
1635				if op.B2 == 0 {
1636					// float32
1637					v := float32(uint32(ce.popValue()))
1638					ce.pushValue(uint64(math.Float32bits(v)))
1639				} else {
1640					// float64
1641					v := float64(uint32(ce.popValue()))
1642					ce.pushValue(math.Float64bits(v))
1643				}
1644			case signedUint64:
1645				if op.B2 == 0 {
1646					// float32
1647					v := float32(ce.popValue())
1648					ce.pushValue(uint64(math.Float32bits(v)))
1649				} else {
1650					// float64
1651					v := float64(ce.popValue())
1652					ce.pushValue(math.Float64bits(v))
1653				}
1654			}
1655			frame.pc++
1656		case operationKindF32DemoteFromF64:
1657			v := float32(math.Float64frombits(ce.popValue()))
1658			ce.pushValue(uint64(math.Float32bits(v)))
1659			frame.pc++
1660		case operationKindF64PromoteFromF32:
1661			v := float64(math.Float32frombits(uint32(ce.popValue())))
1662			ce.pushValue(math.Float64bits(v))
1663			frame.pc++
1664		case operationKindExtend:
1665			if op.B1 == 1 {
1666				// Signed.
1667				v := int64(int32(ce.popValue()))
1668				ce.pushValue(uint64(v))
1669			} else {
1670				v := uint64(uint32(ce.popValue()))
1671				ce.pushValue(v)
1672			}
1673			frame.pc++
1674		case operationKindSignExtend32From8:
1675			v := uint32(int8(ce.popValue()))
1676			ce.pushValue(uint64(v))
1677			frame.pc++
1678		case operationKindSignExtend32From16:
1679			v := uint32(int16(ce.popValue()))
1680			ce.pushValue(uint64(v))
1681			frame.pc++
1682		case operationKindSignExtend64From8:
1683			v := int64(int8(ce.popValue()))
1684			ce.pushValue(uint64(v))
1685			frame.pc++
1686		case operationKindSignExtend64From16:
1687			v := int64(int16(ce.popValue()))
1688			ce.pushValue(uint64(v))
1689			frame.pc++
1690		case operationKindSignExtend64From32:
1691			v := int64(int32(ce.popValue()))
1692			ce.pushValue(uint64(v))
1693			frame.pc++
1694		case operationKindMemoryInit:
1695			dataInstance := dataInstances[op.U1]
1696			copySize := ce.popValue()
1697			inDataOffset := ce.popValue()
1698			inMemoryOffset := ce.popValue()
1699			if inDataOffset+copySize > uint64(len(dataInstance)) ||
1700				inMemoryOffset+copySize > uint64(len(memoryInst.Buffer)) {
1701				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1702			} else if copySize != 0 {
1703				copy(memoryInst.Buffer[inMemoryOffset:inMemoryOffset+copySize], dataInstance[inDataOffset:])
1704			}
1705			frame.pc++
1706		case operationKindDataDrop:
1707			dataInstances[op.U1] = nil
1708			frame.pc++
1709		case operationKindMemoryCopy:
1710			memLen := uint64(len(memoryInst.Buffer))
1711			copySize := ce.popValue()
1712			sourceOffset := ce.popValue()
1713			destinationOffset := ce.popValue()
1714			if sourceOffset+copySize > memLen || destinationOffset+copySize > memLen {
1715				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1716			} else if copySize != 0 {
1717				copy(memoryInst.Buffer[destinationOffset:],
1718					memoryInst.Buffer[sourceOffset:sourceOffset+copySize])
1719			}
1720			frame.pc++
1721		case operationKindMemoryFill:
1722			fillSize := ce.popValue()
1723			value := byte(ce.popValue())
1724			offset := ce.popValue()
1725			if fillSize+offset > uint64(len(memoryInst.Buffer)) {
1726				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1727			} else if fillSize != 0 {
1728				// Uses the copy trick for faster filling buffer.
1729				// https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d
1730				buf := memoryInst.Buffer[offset : offset+fillSize]
1731				buf[0] = value
1732				for i := 1; i < len(buf); i *= 2 {
1733					copy(buf[i:], buf[:i])
1734				}
1735			}
1736			frame.pc++
1737		case operationKindTableInit:
1738			elementInstance := elementInstances[op.U1]
1739			copySize := ce.popValue()
1740			inElementOffset := ce.popValue()
1741			inTableOffset := ce.popValue()
1742			table := tables[op.U2]
1743			if inElementOffset+copySize > uint64(len(elementInstance)) ||
1744				inTableOffset+copySize > uint64(len(table.References)) {
1745				panic(wasmruntime.ErrRuntimeInvalidTableAccess)
1746			} else if copySize != 0 {
1747				copy(table.References[inTableOffset:inTableOffset+copySize], elementInstance[inElementOffset:])
1748			}
1749			frame.pc++
1750		case operationKindElemDrop:
1751			elementInstances[op.U1] = nil
1752			frame.pc++
1753		case operationKindTableCopy:
1754			srcTable, dstTable := tables[op.U1].References, tables[op.U2].References
1755			copySize := ce.popValue()
1756			sourceOffset := ce.popValue()
1757			destinationOffset := ce.popValue()
1758			if sourceOffset+copySize > uint64(len(srcTable)) || destinationOffset+copySize > uint64(len(dstTable)) {
1759				panic(wasmruntime.ErrRuntimeInvalidTableAccess)
1760			} else if copySize != 0 {
1761				copy(dstTable[destinationOffset:], srcTable[sourceOffset:sourceOffset+copySize])
1762			}
1763			frame.pc++
1764		case operationKindRefFunc:
1765			ce.pushValue(uint64(uintptr(unsafe.Pointer(&functions[op.U1]))))
1766			frame.pc++
1767		case operationKindTableGet:
1768			table := tables[op.U1]
1769
1770			offset := ce.popValue()
1771			if offset >= uint64(len(table.References)) {
1772				panic(wasmruntime.ErrRuntimeInvalidTableAccess)
1773			}
1774
1775			ce.pushValue(uint64(table.References[offset]))
1776			frame.pc++
1777		case operationKindTableSet:
1778			table := tables[op.U1]
1779			ref := ce.popValue()
1780
1781			offset := ce.popValue()
1782			if offset >= uint64(len(table.References)) {
1783				panic(wasmruntime.ErrRuntimeInvalidTableAccess)
1784			}
1785
1786			table.References[offset] = uintptr(ref) // externrefs are opaque uint64.
1787			frame.pc++
1788		case operationKindTableSize:
1789			table := tables[op.U1]
1790			ce.pushValue(uint64(len(table.References)))
1791			frame.pc++
1792		case operationKindTableGrow:
1793			table := tables[op.U1]
1794			num, ref := ce.popValue(), ce.popValue()
1795			ret := table.Grow(uint32(num), uintptr(ref))
1796			ce.pushValue(uint64(ret))
1797			frame.pc++
1798		case operationKindTableFill:
1799			table := tables[op.U1]
1800			num := ce.popValue()
1801			ref := uintptr(ce.popValue())
1802			offset := ce.popValue()
1803			if num+offset > uint64(len(table.References)) {
1804				panic(wasmruntime.ErrRuntimeInvalidTableAccess)
1805			} else if num > 0 {
1806				// Uses the copy trick for faster filling the region with the value.
1807				// https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d
1808				targetRegion := table.References[offset : offset+num]
1809				targetRegion[0] = ref
1810				for i := 1; i < len(targetRegion); i *= 2 {
1811					copy(targetRegion[i:], targetRegion[:i])
1812				}
1813			}
1814			frame.pc++
1815		case operationKindV128Const:
1816			lo, hi := op.U1, op.U2
1817			ce.pushValue(lo)
1818			ce.pushValue(hi)
1819			frame.pc++
1820		case operationKindV128Add:
1821			yHigh, yLow := ce.popValue(), ce.popValue()
1822			xHigh, xLow := ce.popValue(), ce.popValue()
1823			switch op.B1 {
1824			case shapeI8x16:
1825				ce.pushValue(
1826					uint64(uint8(xLow>>8)+uint8(yLow>>8))<<8 | uint64(uint8(xLow)+uint8(yLow)) |
1827						uint64(uint8(xLow>>24)+uint8(yLow>>24))<<24 | uint64(uint8(xLow>>16)+uint8(yLow>>16))<<16 |
1828						uint64(uint8(xLow>>40)+uint8(yLow>>40))<<40 | uint64(uint8(xLow>>32)+uint8(yLow>>32))<<32 |
1829						uint64(uint8(xLow>>56)+uint8(yLow>>56))<<56 | uint64(uint8(xLow>>48)+uint8(yLow>>48))<<48,
1830				)
1831				ce.pushValue(
1832					uint64(uint8(xHigh>>8)+uint8(yHigh>>8))<<8 | uint64(uint8(xHigh)+uint8(yHigh)) |
1833						uint64(uint8(xHigh>>24)+uint8(yHigh>>24))<<24 | uint64(uint8(xHigh>>16)+uint8(yHigh>>16))<<16 |
1834						uint64(uint8(xHigh>>40)+uint8(yHigh>>40))<<40 | uint64(uint8(xHigh>>32)+uint8(yHigh>>32))<<32 |
1835						uint64(uint8(xHigh>>56)+uint8(yHigh>>56))<<56 | uint64(uint8(xHigh>>48)+uint8(yHigh>>48))<<48,
1836				)
1837			case shapeI16x8:
1838				ce.pushValue(
1839					uint64(uint16(xLow>>16+yLow>>16))<<16 | uint64(uint16(xLow)+uint16(yLow)) |
1840						uint64(uint16(xLow>>48+yLow>>48))<<48 | uint64(uint16(xLow>>32+yLow>>32))<<32,
1841				)
1842				ce.pushValue(
1843					uint64(uint16(xHigh>>16)+uint16(yHigh>>16))<<16 | uint64(uint16(xHigh)+uint16(yHigh)) |
1844						uint64(uint16(xHigh>>48)+uint16(yHigh>>48))<<48 | uint64(uint16(xHigh>>32)+uint16(yHigh>>32))<<32,
1845				)
1846			case shapeI32x4:
1847				ce.pushValue(uint64(uint32(xLow>>32)+uint32(yLow>>32))<<32 | uint64(uint32(xLow)+uint32(yLow)))
1848				ce.pushValue(uint64(uint32(xHigh>>32)+uint32(yHigh>>32))<<32 | uint64(uint32(xHigh)+uint32(yHigh)))
1849			case shapeI64x2:
1850				ce.pushValue(xLow + yLow)
1851				ce.pushValue(xHigh + yHigh)
1852			case shapeF32x4:
1853				ce.pushValue(
1854					addFloat32bits(uint32(xLow), uint32(yLow)) | addFloat32bits(uint32(xLow>>32), uint32(yLow>>32))<<32,
1855				)
1856				ce.pushValue(
1857					addFloat32bits(uint32(xHigh), uint32(yHigh)) | addFloat32bits(uint32(xHigh>>32), uint32(yHigh>>32))<<32,
1858				)
1859			case shapeF64x2:
1860				ce.pushValue(math.Float64bits(math.Float64frombits(xLow) + math.Float64frombits(yLow)))
1861				ce.pushValue(math.Float64bits(math.Float64frombits(xHigh) + math.Float64frombits(yHigh)))
1862			}
1863			frame.pc++
1864		case operationKindV128Sub:
1865			yHigh, yLow := ce.popValue(), ce.popValue()
1866			xHigh, xLow := ce.popValue(), ce.popValue()
1867			switch op.B1 {
1868			case shapeI8x16:
1869				ce.pushValue(
1870					uint64(uint8(xLow>>8)-uint8(yLow>>8))<<8 | uint64(uint8(xLow)-uint8(yLow)) |
1871						uint64(uint8(xLow>>24)-uint8(yLow>>24))<<24 | uint64(uint8(xLow>>16)-uint8(yLow>>16))<<16 |
1872						uint64(uint8(xLow>>40)-uint8(yLow>>40))<<40 | uint64(uint8(xLow>>32)-uint8(yLow>>32))<<32 |
1873						uint64(uint8(xLow>>56)-uint8(yLow>>56))<<56 | uint64(uint8(xLow>>48)-uint8(yLow>>48))<<48,
1874				)
1875				ce.pushValue(
1876					uint64(uint8(xHigh>>8)-uint8(yHigh>>8))<<8 | uint64(uint8(xHigh)-uint8(yHigh)) |
1877						uint64(uint8(xHigh>>24)-uint8(yHigh>>24))<<24 | uint64(uint8(xHigh>>16)-uint8(yHigh>>16))<<16 |
1878						uint64(uint8(xHigh>>40)-uint8(yHigh>>40))<<40 | uint64(uint8(xHigh>>32)-uint8(yHigh>>32))<<32 |
1879						uint64(uint8(xHigh>>56)-uint8(yHigh>>56))<<56 | uint64(uint8(xHigh>>48)-uint8(yHigh>>48))<<48,
1880				)
1881			case shapeI16x8:
1882				ce.pushValue(
1883					uint64(uint16(xLow>>16)-uint16(yLow>>16))<<16 | uint64(uint16(xLow)-uint16(yLow)) |
1884						uint64(uint16(xLow>>48)-uint16(yLow>>48))<<48 | uint64(uint16(xLow>>32)-uint16(yLow>>32))<<32,
1885				)
1886				ce.pushValue(
1887					uint64(uint16(xHigh>>16)-uint16(yHigh>>16))<<16 | uint64(uint16(xHigh)-uint16(yHigh)) |
1888						uint64(uint16(xHigh>>48)-uint16(yHigh>>48))<<48 | uint64(uint16(xHigh>>32)-uint16(yHigh>>32))<<32,
1889				)
1890			case shapeI32x4:
1891				ce.pushValue(uint64(uint32(xLow>>32-yLow>>32))<<32 | uint64(uint32(xLow)-uint32(yLow)))
1892				ce.pushValue(uint64(uint32(xHigh>>32-yHigh>>32))<<32 | uint64(uint32(xHigh)-uint32(yHigh)))
1893			case shapeI64x2:
1894				ce.pushValue(xLow - yLow)
1895				ce.pushValue(xHigh - yHigh)
1896			case shapeF32x4:
1897				ce.pushValue(
1898					subFloat32bits(uint32(xLow), uint32(yLow)) | subFloat32bits(uint32(xLow>>32), uint32(yLow>>32))<<32,
1899				)
1900				ce.pushValue(
1901					subFloat32bits(uint32(xHigh), uint32(yHigh)) | subFloat32bits(uint32(xHigh>>32), uint32(yHigh>>32))<<32,
1902				)
1903			case shapeF64x2:
1904				ce.pushValue(math.Float64bits(math.Float64frombits(xLow) - math.Float64frombits(yLow)))
1905				ce.pushValue(math.Float64bits(math.Float64frombits(xHigh) - math.Float64frombits(yHigh)))
1906			}
1907			frame.pc++
1908		case operationKindV128Load:
1909			offset := ce.popMemoryOffset(op)
1910			switch op.B1 {
1911			case v128LoadType128:
1912				lo, ok := memoryInst.ReadUint64Le(offset)
1913				if !ok {
1914					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1915				}
1916				ce.pushValue(lo)
1917				hi, ok := memoryInst.ReadUint64Le(offset + 8)
1918				if !ok {
1919					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1920				}
1921				ce.pushValue(hi)
1922			case v128LoadType8x8s:
1923				data, ok := memoryInst.Read(offset, 8)
1924				if !ok {
1925					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1926				}
1927				ce.pushValue(
1928					uint64(uint16(int8(data[3])))<<48 | uint64(uint16(int8(data[2])))<<32 | uint64(uint16(int8(data[1])))<<16 | uint64(uint16(int8(data[0]))),
1929				)
1930				ce.pushValue(
1931					uint64(uint16(int8(data[7])))<<48 | uint64(uint16(int8(data[6])))<<32 | uint64(uint16(int8(data[5])))<<16 | uint64(uint16(int8(data[4]))),
1932				)
1933			case v128LoadType8x8u:
1934				data, ok := memoryInst.Read(offset, 8)
1935				if !ok {
1936					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1937				}
1938				ce.pushValue(
1939					uint64(data[3])<<48 | uint64(data[2])<<32 | uint64(data[1])<<16 | uint64(data[0]),
1940				)
1941				ce.pushValue(
1942					uint64(data[7])<<48 | uint64(data[6])<<32 | uint64(data[5])<<16 | uint64(data[4]),
1943				)
1944			case v128LoadType16x4s:
1945				data, ok := memoryInst.Read(offset, 8)
1946				if !ok {
1947					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1948				}
1949				ce.pushValue(
1950					uint64(int16(binary.LittleEndian.Uint16(data[2:])))<<32 |
1951						uint64(uint32(int16(binary.LittleEndian.Uint16(data)))),
1952				)
1953				ce.pushValue(
1954					uint64(uint32(int16(binary.LittleEndian.Uint16(data[6:]))))<<32 |
1955						uint64(uint32(int16(binary.LittleEndian.Uint16(data[4:])))),
1956				)
1957			case v128LoadType16x4u:
1958				data, ok := memoryInst.Read(offset, 8)
1959				if !ok {
1960					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1961				}
1962				ce.pushValue(
1963					uint64(binary.LittleEndian.Uint16(data[2:]))<<32 | uint64(binary.LittleEndian.Uint16(data)),
1964				)
1965				ce.pushValue(
1966					uint64(binary.LittleEndian.Uint16(data[6:]))<<32 | uint64(binary.LittleEndian.Uint16(data[4:])),
1967				)
1968			case v128LoadType32x2s:
1969				data, ok := memoryInst.Read(offset, 8)
1970				if !ok {
1971					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1972				}
1973				ce.pushValue(uint64(int32(binary.LittleEndian.Uint32(data))))
1974				ce.pushValue(uint64(int32(binary.LittleEndian.Uint32(data[4:]))))
1975			case v128LoadType32x2u:
1976				data, ok := memoryInst.Read(offset, 8)
1977				if !ok {
1978					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1979				}
1980				ce.pushValue(uint64(binary.LittleEndian.Uint32(data)))
1981				ce.pushValue(uint64(binary.LittleEndian.Uint32(data[4:])))
1982			case v128LoadType8Splat:
1983				v, ok := memoryInst.ReadByte(offset)
1984				if !ok {
1985					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1986				}
1987				v8 := uint64(v)<<56 | uint64(v)<<48 | uint64(v)<<40 | uint64(v)<<32 |
1988					uint64(v)<<24 | uint64(v)<<16 | uint64(v)<<8 | uint64(v)
1989				ce.pushValue(v8)
1990				ce.pushValue(v8)
1991			case v128LoadType16Splat:
1992				v, ok := memoryInst.ReadUint16Le(offset)
1993				if !ok {
1994					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
1995				}
1996				v4 := uint64(v)<<48 | uint64(v)<<32 | uint64(v)<<16 | uint64(v)
1997				ce.pushValue(v4)
1998				ce.pushValue(v4)
1999			case v128LoadType32Splat:
2000				v, ok := memoryInst.ReadUint32Le(offset)
2001				if !ok {
2002					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2003				}
2004				vv := uint64(v)<<32 | uint64(v)
2005				ce.pushValue(vv)
2006				ce.pushValue(vv)
2007			case v128LoadType64Splat:
2008				lo, ok := memoryInst.ReadUint64Le(offset)
2009				if !ok {
2010					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2011				}
2012				ce.pushValue(lo)
2013				ce.pushValue(lo)
2014			case v128LoadType32zero:
2015				lo, ok := memoryInst.ReadUint32Le(offset)
2016				if !ok {
2017					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2018				}
2019				ce.pushValue(uint64(lo))
2020				ce.pushValue(0)
2021			case v128LoadType64zero:
2022				lo, ok := memoryInst.ReadUint64Le(offset)
2023				if !ok {
2024					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2025				}
2026				ce.pushValue(lo)
2027				ce.pushValue(0)
2028			}
2029			frame.pc++
2030		case operationKindV128LoadLane:
2031			hi, lo := ce.popValue(), ce.popValue()
2032			offset := ce.popMemoryOffset(op)
2033			switch op.B1 {
2034			case 8:
2035				b, ok := memoryInst.ReadByte(offset)
2036				if !ok {
2037					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2038				}
2039				if op.B2 < 8 {
2040					s := op.B2 << 3
2041					lo = (lo & ^(0xff << s)) | uint64(b)<<s
2042				} else {
2043					s := (op.B2 - 8) << 3
2044					hi = (hi & ^(0xff << s)) | uint64(b)<<s
2045				}
2046			case 16:
2047				b, ok := memoryInst.ReadUint16Le(offset)
2048				if !ok {
2049					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2050				}
2051				if op.B2 < 4 {
2052					s := op.B2 << 4
2053					lo = (lo & ^(0xff_ff << s)) | uint64(b)<<s
2054				} else {
2055					s := (op.B2 - 4) << 4
2056					hi = (hi & ^(0xff_ff << s)) | uint64(b)<<s
2057				}
2058			case 32:
2059				b, ok := memoryInst.ReadUint32Le(offset)
2060				if !ok {
2061					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2062				}
2063				if op.B2 < 2 {
2064					s := op.B2 << 5
2065					lo = (lo & ^(0xff_ff_ff_ff << s)) | uint64(b)<<s
2066				} else {
2067					s := (op.B2 - 2) << 5
2068					hi = (hi & ^(0xff_ff_ff_ff << s)) | uint64(b)<<s
2069				}
2070			case 64:
2071				b, ok := memoryInst.ReadUint64Le(offset)
2072				if !ok {
2073					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2074				}
2075				if op.B2 == 0 {
2076					lo = b
2077				} else {
2078					hi = b
2079				}
2080			}
2081			ce.pushValue(lo)
2082			ce.pushValue(hi)
2083			frame.pc++
2084		case operationKindV128Store:
2085			hi, lo := ce.popValue(), ce.popValue()
2086			offset := ce.popMemoryOffset(op)
2087			// Write the upper bytes first to trigger an early error if the memory access is out of bounds.
2088			// Otherwise, the lower bytes might be written to memory, but the upper bytes might not.
2089			if uint64(offset)+8 > math.MaxUint32 {
2090				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2091			}
2092			if ok := memoryInst.WriteUint64Le(offset+8, hi); !ok {
2093				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2094			}
2095			if ok := memoryInst.WriteUint64Le(offset, lo); !ok {
2096				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2097			}
2098			frame.pc++
2099		case operationKindV128StoreLane:
2100			hi, lo := ce.popValue(), ce.popValue()
2101			offset := ce.popMemoryOffset(op)
2102			var ok bool
2103			switch op.B1 {
2104			case 8:
2105				if op.B2 < 8 {
2106					ok = memoryInst.WriteByte(offset, byte(lo>>(op.B2*8)))
2107				} else {
2108					ok = memoryInst.WriteByte(offset, byte(hi>>((op.B2-8)*8)))
2109				}
2110			case 16:
2111				if op.B2 < 4 {
2112					ok = memoryInst.WriteUint16Le(offset, uint16(lo>>(op.B2*16)))
2113				} else {
2114					ok = memoryInst.WriteUint16Le(offset, uint16(hi>>((op.B2-4)*16)))
2115				}
2116			case 32:
2117				if op.B2 < 2 {
2118					ok = memoryInst.WriteUint32Le(offset, uint32(lo>>(op.B2*32)))
2119				} else {
2120					ok = memoryInst.WriteUint32Le(offset, uint32(hi>>((op.B2-2)*32)))
2121				}
2122			case 64:
2123				if op.B2 == 0 {
2124					ok = memoryInst.WriteUint64Le(offset, lo)
2125				} else {
2126					ok = memoryInst.WriteUint64Le(offset, hi)
2127				}
2128			}
2129			if !ok {
2130				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
2131			}
2132			frame.pc++
2133		case operationKindV128ReplaceLane:
2134			v := ce.popValue()
2135			hi, lo := ce.popValue(), ce.popValue()
2136			switch op.B1 {
2137			case shapeI8x16:
2138				if op.B2 < 8 {
2139					s := op.B2 << 3
2140					lo = (lo & ^(0xff << s)) | uint64(byte(v))<<s
2141				} else {
2142					s := (op.B2 - 8) << 3
2143					hi = (hi & ^(0xff << s)) | uint64(byte(v))<<s
2144				}
2145			case shapeI16x8:
2146				if op.B2 < 4 {
2147					s := op.B2 << 4
2148					lo = (lo & ^(0xff_ff << s)) | uint64(uint16(v))<<s
2149				} else {
2150					s := (op.B2 - 4) << 4
2151					hi = (hi & ^(0xff_ff << s)) | uint64(uint16(v))<<s
2152				}
2153			case shapeI32x4, shapeF32x4:
2154				if op.B2 < 2 {
2155					s := op.B2 << 5
2156					lo = (lo & ^(0xff_ff_ff_ff << s)) | uint64(uint32(v))<<s
2157				} else {
2158					s := (op.B2 - 2) << 5
2159					hi = (hi & ^(0xff_ff_ff_ff << s)) | uint64(uint32(v))<<s
2160				}
2161			case shapeI64x2, shapeF64x2:
2162				if op.B2 == 0 {
2163					lo = v
2164				} else {
2165					hi = v
2166				}
2167			}
2168			ce.pushValue(lo)
2169			ce.pushValue(hi)
2170			frame.pc++
2171		case operationKindV128ExtractLane:
2172			hi, lo := ce.popValue(), ce.popValue()
2173			var v uint64
2174			switch op.B1 {
2175			case shapeI8x16:
2176				var u8 byte
2177				if op.B2 < 8 {
2178					u8 = byte(lo >> (op.B2 * 8))
2179				} else {
2180					u8 = byte(hi >> ((op.B2 - 8) * 8))
2181				}
2182				if op.B3 {
2183					// sign-extend.
2184					v = uint64(uint32(int8(u8)))
2185				} else {
2186					v = uint64(u8)
2187				}
2188			case shapeI16x8:
2189				var u16 uint16
2190				if op.B2 < 4 {
2191					u16 = uint16(lo >> (op.B2 * 16))
2192				} else {
2193					u16 = uint16(hi >> ((op.B2 - 4) * 16))
2194				}
2195				if op.B3 {
2196					// sign-extend.
2197					v = uint64(uint32(int16(u16)))
2198				} else {
2199					v = uint64(u16)
2200				}
2201			case shapeI32x4, shapeF32x4:
2202				if op.B2 < 2 {
2203					v = uint64(uint32(lo >> (op.B2 * 32)))
2204				} else {
2205					v = uint64(uint32(hi >> ((op.B2 - 2) * 32)))
2206				}
2207			case shapeI64x2, shapeF64x2:
2208				if op.B2 == 0 {
2209					v = lo
2210				} else {
2211					v = hi
2212				}
2213			}
2214			ce.pushValue(v)
2215			frame.pc++
2216		case operationKindV128Splat:
2217			v := ce.popValue()
2218			var hi, lo uint64
2219			switch op.B1 {
2220			case shapeI8x16:
2221				v8 := uint64(byte(v))<<56 | uint64(byte(v))<<48 | uint64(byte(v))<<40 | uint64(byte(v))<<32 |
2222					uint64(byte(v))<<24 | uint64(byte(v))<<16 | uint64(byte(v))<<8 | uint64(byte(v))
2223				hi, lo = v8, v8
2224			case shapeI16x8:
2225				v4 := uint64(uint16(v))<<48 | uint64(uint16(v))<<32 | uint64(uint16(v))<<16 | uint64(uint16(v))
2226				hi, lo = v4, v4
2227			case shapeI32x4, shapeF32x4:
2228				v2 := uint64(uint32(v))<<32 | uint64(uint32(v))
2229				lo, hi = v2, v2
2230			case shapeI64x2, shapeF64x2:
2231				lo, hi = v, v
2232			}
2233			ce.pushValue(lo)
2234			ce.pushValue(hi)
2235			frame.pc++
2236		case operationKindV128Swizzle:
2237			idxHi, idxLo := ce.popValue(), ce.popValue()
2238			baseHi, baseLo := ce.popValue(), ce.popValue()
2239			var newVal [16]byte
2240			for i := 0; i < 16; i++ {
2241				var id byte
2242				if i < 8 {
2243					id = byte(idxLo >> (i * 8))
2244				} else {
2245					id = byte(idxHi >> ((i - 8) * 8))
2246				}
2247				if id < 8 {
2248					newVal[i] = byte(baseLo >> (id * 8))
2249				} else if id < 16 {
2250					newVal[i] = byte(baseHi >> ((id - 8) * 8))
2251				}
2252			}
2253			ce.pushValue(binary.LittleEndian.Uint64(newVal[:8]))
2254			ce.pushValue(binary.LittleEndian.Uint64(newVal[8:]))
2255			frame.pc++
2256		case operationKindV128Shuffle:
2257			xHi, xLo, yHi, yLo := ce.popValue(), ce.popValue(), ce.popValue(), ce.popValue()
2258			var newVal [16]byte
2259			for i, l := range op.Us {
2260				if l < 8 {
2261					newVal[i] = byte(yLo >> (l * 8))
2262				} else if l < 16 {
2263					newVal[i] = byte(yHi >> ((l - 8) * 8))
2264				} else if l < 24 {
2265					newVal[i] = byte(xLo >> ((l - 16) * 8))
2266				} else if l < 32 {
2267					newVal[i] = byte(xHi >> ((l - 24) * 8))
2268				}
2269			}
2270			ce.pushValue(binary.LittleEndian.Uint64(newVal[:8]))
2271			ce.pushValue(binary.LittleEndian.Uint64(newVal[8:]))
2272			frame.pc++
2273		case operationKindV128AnyTrue:
2274			hi, lo := ce.popValue(), ce.popValue()
2275			if hi != 0 || lo != 0 {
2276				ce.pushValue(1)
2277			} else {
2278				ce.pushValue(0)
2279			}
2280			frame.pc++
2281		case operationKindV128AllTrue:
2282			hi, lo := ce.popValue(), ce.popValue()
2283			var ret bool
2284			switch op.B1 {
2285			case shapeI8x16:
2286				ret = (uint8(lo) != 0) && (uint8(lo>>8) != 0) && (uint8(lo>>16) != 0) && (uint8(lo>>24) != 0) &&
2287					(uint8(lo>>32) != 0) && (uint8(lo>>40) != 0) && (uint8(lo>>48) != 0) && (uint8(lo>>56) != 0) &&
2288					(uint8(hi) != 0) && (uint8(hi>>8) != 0) && (uint8(hi>>16) != 0) && (uint8(hi>>24) != 0) &&
2289					(uint8(hi>>32) != 0) && (uint8(hi>>40) != 0) && (uint8(hi>>48) != 0) && (uint8(hi>>56) != 0)
2290			case shapeI16x8:
2291				ret = (uint16(lo) != 0) && (uint16(lo>>16) != 0) && (uint16(lo>>32) != 0) && (uint16(lo>>48) != 0) &&
2292					(uint16(hi) != 0) && (uint16(hi>>16) != 0) && (uint16(hi>>32) != 0) && (uint16(hi>>48) != 0)
2293			case shapeI32x4:
2294				ret = (uint32(lo) != 0) && (uint32(lo>>32) != 0) &&
2295					(uint32(hi) != 0) && (uint32(hi>>32) != 0)
2296			case shapeI64x2:
2297				ret = (lo != 0) &&
2298					(hi != 0)
2299			}
2300			if ret {
2301				ce.pushValue(1)
2302			} else {
2303				ce.pushValue(0)
2304			}
2305			frame.pc++
2306		case operationKindV128BitMask:
2307			// https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitmask-extraction
2308			hi, lo := ce.popValue(), ce.popValue()
2309			var res uint64
2310			switch op.B1 {
2311			case shapeI8x16:
2312				for i := 0; i < 8; i++ {
2313					if int8(lo>>(i*8)) < 0 {
2314						res |= 1 << i
2315					}
2316				}
2317				for i := 0; i < 8; i++ {
2318					if int8(hi>>(i*8)) < 0 {
2319						res |= 1 << (i + 8)
2320					}
2321				}
2322			case shapeI16x8:
2323				for i := 0; i < 4; i++ {
2324					if int16(lo>>(i*16)) < 0 {
2325						res |= 1 << i
2326					}
2327				}
2328				for i := 0; i < 4; i++ {
2329					if int16(hi>>(i*16)) < 0 {
2330						res |= 1 << (i + 4)
2331					}
2332				}
2333			case shapeI32x4:
2334				for i := 0; i < 2; i++ {
2335					if int32(lo>>(i*32)) < 0 {
2336						res |= 1 << i
2337					}
2338				}
2339				for i := 0; i < 2; i++ {
2340					if int32(hi>>(i*32)) < 0 {
2341						res |= 1 << (i + 2)
2342					}
2343				}
2344			case shapeI64x2:
2345				if int64(lo) < 0 {
2346					res |= 0b01
2347				}
2348				if int(hi) < 0 {
2349					res |= 0b10
2350				}
2351			}
2352			ce.pushValue(res)
2353			frame.pc++
2354		case operationKindV128And:
2355			x2Hi, x2Lo := ce.popValue(), ce.popValue()
2356			x1Hi, x1Lo := ce.popValue(), ce.popValue()
2357			ce.pushValue(x1Lo & x2Lo)
2358			ce.pushValue(x1Hi & x2Hi)
2359			frame.pc++
2360		case operationKindV128Not:
2361			hi, lo := ce.popValue(), ce.popValue()
2362			ce.pushValue(^lo)
2363			ce.pushValue(^hi)
2364			frame.pc++
2365		case operationKindV128Or:
2366			x2Hi, x2Lo := ce.popValue(), ce.popValue()
2367			x1Hi, x1Lo := ce.popValue(), ce.popValue()
2368			ce.pushValue(x1Lo | x2Lo)
2369			ce.pushValue(x1Hi | x2Hi)
2370			frame.pc++
2371		case operationKindV128Xor:
2372			x2Hi, x2Lo := ce.popValue(), ce.popValue()
2373			x1Hi, x1Lo := ce.popValue(), ce.popValue()
2374			ce.pushValue(x1Lo ^ x2Lo)
2375			ce.pushValue(x1Hi ^ x2Hi)
2376			frame.pc++
2377		case operationKindV128Bitselect:
2378			// https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitwise-select
2379			cHi, cLo := ce.popValue(), ce.popValue()
2380			x2Hi, x2Lo := ce.popValue(), ce.popValue()
2381			x1Hi, x1Lo := ce.popValue(), ce.popValue()
2382			// v128.or(v128.and(v1, c), v128.and(v2, v128.not(c)))
2383			ce.pushValue((x1Lo & cLo) | (x2Lo & (^cLo)))
2384			ce.pushValue((x1Hi & cHi) | (x2Hi & (^cHi)))
2385			frame.pc++
2386		case operationKindV128AndNot:
2387			x2Hi, x2Lo := ce.popValue(), ce.popValue()
2388			x1Hi, x1Lo := ce.popValue(), ce.popValue()
2389			ce.pushValue(x1Lo & (^x2Lo))
2390			ce.pushValue(x1Hi & (^x2Hi))
2391			frame.pc++
2392		case operationKindV128Shl:
2393			s := ce.popValue()
2394			hi, lo := ce.popValue(), ce.popValue()
2395			switch op.B1 {
2396			case shapeI8x16:
2397				s = s % 8
2398				lo = uint64(uint8(lo<<s)) |
2399					uint64(uint8((lo>>8)<<s))<<8 |
2400					uint64(uint8((lo>>16)<<s))<<16 |
2401					uint64(uint8((lo>>24)<<s))<<24 |
2402					uint64(uint8((lo>>32)<<s))<<32 |
2403					uint64(uint8((lo>>40)<<s))<<40 |
2404					uint64(uint8((lo>>48)<<s))<<48 |
2405					uint64(uint8((lo>>56)<<s))<<56
2406				hi = uint64(uint8(hi<<s)) |
2407					uint64(uint8((hi>>8)<<s))<<8 |
2408					uint64(uint8((hi>>16)<<s))<<16 |
2409					uint64(uint8((hi>>24)<<s))<<24 |
2410					uint64(uint8((hi>>32)<<s))<<32 |
2411					uint64(uint8((hi>>40)<<s))<<40 |
2412					uint64(uint8((hi>>48)<<s))<<48 |
2413					uint64(uint8((hi>>56)<<s))<<56
2414			case shapeI16x8:
2415				s = s % 16
2416				lo = uint64(uint16(lo<<s)) |
2417					uint64(uint16((lo>>16)<<s))<<16 |
2418					uint64(uint16((lo>>32)<<s))<<32 |
2419					uint64(uint16((lo>>48)<<s))<<48
2420				hi = uint64(uint16(hi<<s)) |
2421					uint64(uint16((hi>>16)<<s))<<16 |
2422					uint64(uint16((hi>>32)<<s))<<32 |
2423					uint64(uint16((hi>>48)<<s))<<48
2424			case shapeI32x4:
2425				s = s % 32
2426				lo = uint64(uint32(lo<<s)) | uint64(uint32((lo>>32)<<s))<<32
2427				hi = uint64(uint32(hi<<s)) | uint64(uint32((hi>>32)<<s))<<32
2428			case shapeI64x2:
2429				s = s % 64
2430				lo = lo << s
2431				hi = hi << s
2432			}
2433			ce.pushValue(lo)
2434			ce.pushValue(hi)
2435			frame.pc++
2436		case operationKindV128Shr:
2437			s := ce.popValue()
2438			hi, lo := ce.popValue(), ce.popValue()
2439			switch op.B1 {
2440			case shapeI8x16:
2441				s = s % 8
2442				if op.B3 { // signed
2443					lo = uint64(uint8(int8(lo)>>s)) |
2444						uint64(uint8(int8(lo>>8)>>s))<<8 |
2445						uint64(uint8(int8(lo>>16)>>s))<<16 |
2446						uint64(uint8(int8(lo>>24)>>s))<<24 |
2447						uint64(uint8(int8(lo>>32)>>s))<<32 |
2448						uint64(uint8(int8(lo>>40)>>s))<<40 |
2449						uint64(uint8(int8(lo>>48)>>s))<<48 |
2450						uint64(uint8(int8(lo>>56)>>s))<<56
2451					hi = uint64(uint8(int8(hi)>>s)) |
2452						uint64(uint8(int8(hi>>8)>>s))<<8 |
2453						uint64(uint8(int8(hi>>16)>>s))<<16 |
2454						uint64(uint8(int8(hi>>24)>>s))<<24 |
2455						uint64(uint8(int8(hi>>32)>>s))<<32 |
2456						uint64(uint8(int8(hi>>40)>>s))<<40 |
2457						uint64(uint8(int8(hi>>48)>>s))<<48 |
2458						uint64(uint8(int8(hi>>56)>>s))<<56
2459				} else {
2460					lo = uint64(uint8(lo)>>s) |
2461						uint64(uint8(lo>>8)>>s)<<8 |
2462						uint64(uint8(lo>>16)>>s)<<16 |
2463						uint64(uint8(lo>>24)>>s)<<24 |
2464						uint64(uint8(lo>>32)>>s)<<32 |
2465						uint64(uint8(lo>>40)>>s)<<40 |
2466						uint64(uint8(lo>>48)>>s)<<48 |
2467						uint64(uint8(lo>>56)>>s)<<56
2468					hi = uint64(uint8(hi)>>s) |
2469						uint64(uint8(hi>>8)>>s)<<8 |
2470						uint64(uint8(hi>>16)>>s)<<16 |
2471						uint64(uint8(hi>>24)>>s)<<24 |
2472						uint64(uint8(hi>>32)>>s)<<32 |
2473						uint64(uint8(hi>>40)>>s)<<40 |
2474						uint64(uint8(hi>>48)>>s)<<48 |
2475						uint64(uint8(hi>>56)>>s)<<56
2476				}
2477			case shapeI16x8:
2478				s = s % 16
2479				if op.B3 { // signed
2480					lo = uint64(uint16(int16(lo)>>s)) |
2481						uint64(uint16(int16(lo>>16)>>s))<<16 |
2482						uint64(uint16(int16(lo>>32)>>s))<<32 |
2483						uint64(uint16(int16(lo>>48)>>s))<<48
2484					hi = uint64(uint16(int16(hi)>>s)) |
2485						uint64(uint16(int16(hi>>16)>>s))<<16 |
2486						uint64(uint16(int16(hi>>32)>>s))<<32 |
2487						uint64(uint16(int16(hi>>48)>>s))<<48
2488				} else {
2489					lo = uint64(uint16(lo)>>s) |
2490						uint64(uint16(lo>>16)>>s)<<16 |
2491						uint64(uint16(lo>>32)>>s)<<32 |
2492						uint64(uint16(lo>>48)>>s)<<48
2493					hi = uint64(uint16(hi)>>s) |
2494						uint64(uint16(hi>>16)>>s)<<16 |
2495						uint64(uint16(hi>>32)>>s)<<32 |
2496						uint64(uint16(hi>>48)>>s)<<48
2497				}
2498			case shapeI32x4:
2499				s = s % 32
2500				if op.B3 {
2501					lo = uint64(uint32(int32(lo)>>s)) | uint64(uint32(int32(lo>>32)>>s))<<32
2502					hi = uint64(uint32(int32(hi)>>s)) | uint64(uint32(int32(hi>>32)>>s))<<32
2503				} else {
2504					lo = uint64(uint32(lo)>>s) | uint64(uint32(lo>>32)>>s)<<32
2505					hi = uint64(uint32(hi)>>s) | uint64(uint32(hi>>32)>>s)<<32
2506				}
2507			case shapeI64x2:
2508				s = s % 64
2509				if op.B3 { // signed
2510					lo = uint64(int64(lo) >> s)
2511					hi = uint64(int64(hi) >> s)
2512				} else {
2513					lo = lo >> s
2514					hi = hi >> s
2515				}
2516
2517			}
2518			ce.pushValue(lo)
2519			ce.pushValue(hi)
2520			frame.pc++
2521		case operationKindV128Cmp:
2522			x2Hi, x2Lo := ce.popValue(), ce.popValue()
2523			x1Hi, x1Lo := ce.popValue(), ce.popValue()
2524			var result []bool
2525			switch op.B1 {
2526			case v128CmpTypeI8x16Eq:
2527				result = []bool{
2528					byte(x1Lo>>0) == byte(x2Lo>>0), byte(x1Lo>>8) == byte(x2Lo>>8),
2529					byte(x1Lo>>16) == byte(x2Lo>>16), byte(x1Lo>>24) == byte(x2Lo>>24),
2530					byte(x1Lo>>32) == byte(x2Lo>>32), byte(x1Lo>>40) == byte(x2Lo>>40),
2531					byte(x1Lo>>48) == byte(x2Lo>>48), byte(x1Lo>>56) == byte(x2Lo>>56),
2532					byte(x1Hi>>0) == byte(x2Hi>>0), byte(x1Hi>>8) == byte(x2Hi>>8),
2533					byte(x1Hi>>16) == byte(x2Hi>>16), byte(x1Hi>>24) == byte(x2Hi>>24),
2534					byte(x1Hi>>32) == byte(x2Hi>>32), byte(x1Hi>>40) == byte(x2Hi>>40),
2535					byte(x1Hi>>48) == byte(x2Hi>>48), byte(x1Hi>>56) == byte(x2Hi>>56),
2536				}
2537			case v128CmpTypeI8x16Ne:
2538				result = []bool{
2539					byte(x1Lo>>0) != byte(x2Lo>>0), byte(x1Lo>>8) != byte(x2Lo>>8),
2540					byte(x1Lo>>16) != byte(x2Lo>>16), byte(x1Lo>>24) != byte(x2Lo>>24),
2541					byte(x1Lo>>32) != byte(x2Lo>>32), byte(x1Lo>>40) != byte(x2Lo>>40),
2542					byte(x1Lo>>48) != byte(x2Lo>>48), byte(x1Lo>>56) != byte(x2Lo>>56),
2543					byte(x1Hi>>0) != byte(x2Hi>>0), byte(x1Hi>>8) != byte(x2Hi>>8),
2544					byte(x1Hi>>16) != byte(x2Hi>>16), byte(x1Hi>>24) != byte(x2Hi>>24),
2545					byte(x1Hi>>32) != byte(x2Hi>>32), byte(x1Hi>>40) != byte(x2Hi>>40),
2546					byte(x1Hi>>48) != byte(x2Hi>>48), byte(x1Hi>>56) != byte(x2Hi>>56),
2547				}
2548			case v128CmpTypeI8x16LtS:
2549				result = []bool{
2550					int8(x1Lo>>0) < int8(x2Lo>>0), int8(x1Lo>>8) < int8(x2Lo>>8),
2551					int8(x1Lo>>16) < int8(x2Lo>>16), int8(x1Lo>>24) < int8(x2Lo>>24),
2552					int8(x1Lo>>32) < int8(x2Lo>>32), int8(x1Lo>>40) < int8(x2Lo>>40),
2553					int8(x1Lo>>48) < int8(x2Lo>>48), int8(x1Lo>>56) < int8(x2Lo>>56),
2554					int8(x1Hi>>0) < int8(x2Hi>>0), int8(x1Hi>>8) < int8(x2Hi>>8),
2555					int8(x1Hi>>16) < int8(x2Hi>>16), int8(x1Hi>>24) < int8(x2Hi>>24),
2556					int8(x1Hi>>32) < int8(x2Hi>>32), int8(x1Hi>>40) < int8(x2Hi>>40),
2557					int8(x1Hi>>48) < int8(x2Hi>>48), int8(x1Hi>>56) < int8(x2Hi>>56),
2558				}
2559			case v128CmpTypeI8x16LtU:
2560				result = []bool{
2561					byte(x1Lo>>0) < byte(x2Lo>>0), byte(x1Lo>>8) < byte(x2Lo>>8),
2562					byte(x1Lo>>16) < byte(x2Lo>>16), byte(x1Lo>>24) < byte(x2Lo>>24),
2563					byte(x1Lo>>32) < byte(x2Lo>>32), byte(x1Lo>>40) < byte(x2Lo>>40),
2564					byte(x1Lo>>48) < byte(x2Lo>>48), byte(x1Lo>>56) < byte(x2Lo>>56),
2565					byte(x1Hi>>0) < byte(x2Hi>>0), byte(x1Hi>>8) < byte(x2Hi>>8),
2566					byte(x1Hi>>16) < byte(x2Hi>>16), byte(x1Hi>>24) < byte(x2Hi>>24),
2567					byte(x1Hi>>32) < byte(x2Hi>>32), byte(x1Hi>>40) < byte(x2Hi>>40),
2568					byte(x1Hi>>48) < byte(x2Hi>>48), byte(x1Hi>>56) < byte(x2Hi>>56),
2569				}
2570			case v128CmpTypeI8x16GtS:
2571				result = []bool{
2572					int8(x1Lo>>0) > int8(x2Lo>>0), int8(x1Lo>>8) > int8(x2Lo>>8),
2573					int8(x1Lo>>16) > int8(x2Lo>>16), int8(x1Lo>>24) > int8(x2Lo>>24),
2574					int8(x1Lo>>32) > int8(x2Lo>>32), int8(x1Lo>>40) > int8(x2Lo>>40),
2575					int8(x1Lo>>48) > int8(x2Lo>>48), int8(x1Lo>>56) > int8(x2Lo>>56),
2576					int8(x1Hi>>0) > int8(x2Hi>>0), int8(x1Hi>>8) > int8(x2Hi>>8),
2577					int8(x1Hi>>16) > int8(x2Hi>>16), int8(x1Hi>>24) > int8(x2Hi>>24),
2578					int8(x1Hi>>32) > int8(x2Hi>>32), int8(x1Hi>>40) > int8(x2Hi>>40),
2579					int8(x1Hi>>48) > int8(x2Hi>>48), int8(x1Hi>>56) > int8(x2Hi>>56),
2580				}
2581			case v128CmpTypeI8x16GtU:
2582				result = []bool{
2583					byte(x1Lo>>0) > byte(x2Lo>>0), byte(x1Lo>>8) > byte(x2Lo>>8),
2584					byte(x1Lo>>16) > byte(x2Lo>>16), byte(x1Lo>>24) > byte(x2Lo>>24),
2585					byte(x1Lo>>32) > byte(x2Lo>>32), byte(x1Lo>>40) > byte(x2Lo>>40),
2586					byte(x1Lo>>48) > byte(x2Lo>>48), byte(x1Lo>>56) > byte(x2Lo>>56),
2587					byte(x1Hi>>0) > byte(x2Hi>>0), byte(x1Hi>>8) > byte(x2Hi>>8),
2588					byte(x1Hi>>16) > byte(x2Hi>>16), byte(x1Hi>>24) > byte(x2Hi>>24),
2589					byte(x1Hi>>32) > byte(x2Hi>>32), byte(x1Hi>>40) > byte(x2Hi>>40),
2590					byte(x1Hi>>48) > byte(x2Hi>>48), byte(x1Hi>>56) > byte(x2Hi>>56),
2591				}
2592			case v128CmpTypeI8x16LeS:
2593				result = []bool{
2594					int8(x1Lo>>0) <= int8(x2Lo>>0), int8(x1Lo>>8) <= int8(x2Lo>>8),
2595					int8(x1Lo>>16) <= int8(x2Lo>>16), int8(x1Lo>>24) <= int8(x2Lo>>24),
2596					int8(x1Lo>>32) <= int8(x2Lo>>32), int8(x1Lo>>40) <= int8(x2Lo>>40),
2597					int8(x1Lo>>48) <= int8(x2Lo>>48), int8(x1Lo>>56) <= int8(x2Lo>>56),
2598					int8(x1Hi>>0) <= int8(x2Hi>>0), int8(x1Hi>>8) <= int8(x2Hi>>8),
2599					int8(x1Hi>>16) <= int8(x2Hi>>16), int8(x1Hi>>24) <= int8(x2Hi>>24),
2600					int8(x1Hi>>32) <= int8(x2Hi>>32), int8(x1Hi>>40) <= int8(x2Hi>>40),
2601					int8(x1Hi>>48) <= int8(x2Hi>>48), int8(x1Hi>>56) <= int8(x2Hi>>56),
2602				}
2603			case v128CmpTypeI8x16LeU:
2604				result = []bool{
2605					byte(x1Lo>>0) <= byte(x2Lo>>0), byte(x1Lo>>8) <= byte(x2Lo>>8),
2606					byte(x1Lo>>16) <= byte(x2Lo>>16), byte(x1Lo>>24) <= byte(x2Lo>>24),
2607					byte(x1Lo>>32) <= byte(x2Lo>>32), byte(x1Lo>>40) <= byte(x2Lo>>40),
2608					byte(x1Lo>>48) <= byte(x2Lo>>48), byte(x1Lo>>56) <= byte(x2Lo>>56),
2609					byte(x1Hi>>0) <= byte(x2Hi>>0), byte(x1Hi>>8) <= byte(x2Hi>>8),
2610					byte(x1Hi>>16) <= byte(x2Hi>>16), byte(x1Hi>>24) <= byte(x2Hi>>24),
2611					byte(x1Hi>>32) <= byte(x2Hi>>32), byte(x1Hi>>40) <= byte(x2Hi>>40),
2612					byte(x1Hi>>48) <= byte(x2Hi>>48), byte(x1Hi>>56) <= byte(x2Hi>>56),
2613				}
2614			case v128CmpTypeI8x16GeS:
2615				result = []bool{
2616					int8(x1Lo>>0) >= int8(x2Lo>>0), int8(x1Lo>>8) >= int8(x2Lo>>8),
2617					int8(x1Lo>>16) >= int8(x2Lo>>16), int8(x1Lo>>24) >= int8(x2Lo>>24),
2618					int8(x1Lo>>32) >= int8(x2Lo>>32), int8(x1Lo>>40) >= int8(x2Lo>>40),
2619					int8(x1Lo>>48) >= int8(x2Lo>>48), int8(x1Lo>>56) >= int8(x2Lo>>56),
2620					int8(x1Hi>>0) >= int8(x2Hi>>0), int8(x1Hi>>8) >= int8(x2Hi>>8),
2621					int8(x1Hi>>16) >= int8(x2Hi>>16), int8(x1Hi>>24) >= int8(x2Hi>>24),
2622					int8(x1Hi>>32) >= int8(x2Hi>>32), int8(x1Hi>>40) >= int8(x2Hi>>40),
2623					int8(x1Hi>>48) >= int8(x2Hi>>48), int8(x1Hi>>56) >= int8(x2Hi>>56),
2624				}
2625			case v128CmpTypeI8x16GeU:
2626				result = []bool{
2627					byte(x1Lo>>0) >= byte(x2Lo>>0), byte(x1Lo>>8) >= byte(x2Lo>>8),
2628					byte(x1Lo>>16) >= byte(x2Lo>>16), byte(x1Lo>>24) >= byte(x2Lo>>24),
2629					byte(x1Lo>>32) >= byte(x2Lo>>32), byte(x1Lo>>40) >= byte(x2Lo>>40),
2630					byte(x1Lo>>48) >= byte(x2Lo>>48), byte(x1Lo>>56) >= byte(x2Lo>>56),
2631					byte(x1Hi>>0) >= byte(x2Hi>>0), byte(x1Hi>>8) >= byte(x2Hi>>8),
2632					byte(x1Hi>>16) >= byte(x2Hi>>16), byte(x1Hi>>24) >= byte(x2Hi>>24),
2633					byte(x1Hi>>32) >= byte(x2Hi>>32), byte(x1Hi>>40) >= byte(x2Hi>>40),
2634					byte(x1Hi>>48) >= byte(x2Hi>>48), byte(x1Hi>>56) >= byte(x2Hi>>56),
2635				}
2636			case v128CmpTypeI16x8Eq:
2637				result = []bool{
2638					uint16(x1Lo>>0) == uint16(x2Lo>>0), uint16(x1Lo>>16) == uint16(x2Lo>>16),
2639					uint16(x1Lo>>32) == uint16(x2Lo>>32), uint16(x1Lo>>48) == uint16(x2Lo>>48),
2640					uint16(x1Hi>>0) == uint16(x2Hi>>0), uint16(x1Hi>>16) == uint16(x2Hi>>16),
2641					uint16(x1Hi>>32) == uint16(x2Hi>>32), uint16(x1Hi>>48) == uint16(x2Hi>>48),
2642				}
2643			case v128CmpTypeI16x8Ne:
2644				result = []bool{
2645					uint16(x1Lo>>0) != uint16(x2Lo>>0), uint16(x1Lo>>16) != uint16(x2Lo>>16),
2646					uint16(x1Lo>>32) != uint16(x2Lo>>32), uint16(x1Lo>>48) != uint16(x2Lo>>48),
2647					uint16(x1Hi>>0) != uint16(x2Hi>>0), uint16(x1Hi>>16) != uint16(x2Hi>>16),
2648					uint16(x1Hi>>32) != uint16(x2Hi>>32), uint16(x1Hi>>48) != uint16(x2Hi>>48),
2649				}
2650			case v128CmpTypeI16x8LtS:
2651				result = []bool{
2652					int16(x1Lo>>0) < int16(x2Lo>>0), int16(x1Lo>>16) < int16(x2Lo>>16),
2653					int16(x1Lo>>32) < int16(x2Lo>>32), int16(x1Lo>>48) < int16(x2Lo>>48),
2654					int16(x1Hi>>0) < int16(x2Hi>>0), int16(x1Hi>>16) < int16(x2Hi>>16),
2655					int16(x1Hi>>32) < int16(x2Hi>>32), int16(x1Hi>>48) < int16(x2Hi>>48),
2656				}
2657			case v128CmpTypeI16x8LtU:
2658				result = []bool{
2659					uint16(x1Lo>>0) < uint16(x2Lo>>0), uint16(x1Lo>>16) < uint16(x2Lo>>16),
2660					uint16(x1Lo>>32) < uint16(x2Lo>>32), uint16(x1Lo>>48) < uint16(x2Lo>>48),
2661					uint16(x1Hi>>0) < uint16(x2Hi>>0), uint16(x1Hi>>16) < uint16(x2Hi>>16),
2662					uint16(x1Hi>>32) < uint16(x2Hi>>32), uint16(x1Hi>>48) < uint16(x2Hi>>48),
2663				}
2664			case v128CmpTypeI16x8GtS:
2665				result = []bool{
2666					int16(x1Lo>>0) > int16(x2Lo>>0), int16(x1Lo>>16) > int16(x2Lo>>16),
2667					int16(x1Lo>>32) > int16(x2Lo>>32), int16(x1Lo>>48) > int16(x2Lo>>48),
2668					int16(x1Hi>>0) > int16(x2Hi>>0), int16(x1Hi>>16) > int16(x2Hi>>16),
2669					int16(x1Hi>>32) > int16(x2Hi>>32), int16(x1Hi>>48) > int16(x2Hi>>48),
2670				}
2671			case v128CmpTypeI16x8GtU:
2672				result = []bool{
2673					uint16(x1Lo>>0) > uint16(x2Lo>>0), uint16(x1Lo>>16) > uint16(x2Lo>>16),
2674					uint16(x1Lo>>32) > uint16(x2Lo>>32), uint16(x1Lo>>48) > uint16(x2Lo>>48),
2675					uint16(x1Hi>>0) > uint16(x2Hi>>0), uint16(x1Hi>>16) > uint16(x2Hi>>16),
2676					uint16(x1Hi>>32) > uint16(x2Hi>>32), uint16(x1Hi>>48) > uint16(x2Hi>>48),
2677				}
2678			case v128CmpTypeI16x8LeS:
2679				result = []bool{
2680					int16(x1Lo>>0) <= int16(x2Lo>>0), int16(x1Lo>>16) <= int16(x2Lo>>16),
2681					int16(x1Lo>>32) <= int16(x2Lo>>32), int16(x1Lo>>48) <= int16(x2Lo>>48),
2682					int16(x1Hi>>0) <= int16(x2Hi>>0), int16(x1Hi>>16) <= int16(x2Hi>>16),
2683					int16(x1Hi>>32) <= int16(x2Hi>>32), int16(x1Hi>>48) <= int16(x2Hi>>48),
2684				}
2685			case v128CmpTypeI16x8LeU:
2686				result = []bool{
2687					uint16(x1Lo>>0) <= uint16(x2Lo>>0), uint16(x1Lo>>16) <= uint16(x2Lo>>16),
2688					uint16(x1Lo>>32) <= uint16(x2Lo>>32), uint16(x1Lo>>48) <= uint16(x2Lo>>48),
2689					uint16(x1Hi>>0) <= uint16(x2Hi>>0), uint16(x1Hi>>16) <= uint16(x2Hi>>16),
2690					uint16(x1Hi>>32) <= uint16(x2Hi>>32), uint16(x1Hi>>48) <= uint16(x2Hi>>48),
2691				}
2692			case v128CmpTypeI16x8GeS:
2693				result = []bool{
2694					int16(x1Lo>>0) >= int16(x2Lo>>0), int16(x1Lo>>16) >= int16(x2Lo>>16),
2695					int16(x1Lo>>32) >= int16(x2Lo>>32), int16(x1Lo>>48) >= int16(x2Lo>>48),
2696					int16(x1Hi>>0) >= int16(x2Hi>>0), int16(x1Hi>>16) >= int16(x2Hi>>16),
2697					int16(x1Hi>>32) >= int16(x2Hi>>32), int16(x1Hi>>48) >= int16(x2Hi>>48),
2698				}
2699			case v128CmpTypeI16x8GeU:
2700				result = []bool{
2701					uint16(x1Lo>>0) >= uint16(x2Lo>>0), uint16(x1Lo>>16) >= uint16(x2Lo>>16),
2702					uint16(x1Lo>>32) >= uint16(x2Lo>>32), uint16(x1Lo>>48) >= uint16(x2Lo>>48),
2703					uint16(x1Hi>>0) >= uint16(x2Hi>>0), uint16(x1Hi>>16) >= uint16(x2Hi>>16),
2704					uint16(x1Hi>>32) >= uint16(x2Hi>>32), uint16(x1Hi>>48) >= uint16(x2Hi>>48),
2705				}
2706			case v128CmpTypeI32x4Eq:
2707				result = []bool{
2708					uint32(x1Lo>>0) == uint32(x2Lo>>0), uint32(x1Lo>>32) == uint32(x2Lo>>32),
2709					uint32(x1Hi>>0) == uint32(x2Hi>>0), uint32(x1Hi>>32) == uint32(x2Hi>>32),
2710				}
2711			case v128CmpTypeI32x4Ne:
2712				result = []bool{
2713					uint32(x1Lo>>0) != uint32(x2Lo>>0), uint32(x1Lo>>32) != uint32(x2Lo>>32),
2714					uint32(x1Hi>>0) != uint32(x2Hi>>0), uint32(x1Hi>>32) != uint32(x2Hi>>32),
2715				}
2716			case v128CmpTypeI32x4LtS:
2717				result = []bool{
2718					int32(x1Lo>>0) < int32(x2Lo>>0), int32(x1Lo>>32) < int32(x2Lo>>32),
2719					int32(x1Hi>>0) < int32(x2Hi>>0), int32(x1Hi>>32) < int32(x2Hi>>32),
2720				}
2721			case v128CmpTypeI32x4LtU:
2722				result = []bool{
2723					uint32(x1Lo>>0) < uint32(x2Lo>>0), uint32(x1Lo>>32) < uint32(x2Lo>>32),
2724					uint32(x1Hi>>0) < uint32(x2Hi>>0), uint32(x1Hi>>32) < uint32(x2Hi>>32),
2725				}
2726			case v128CmpTypeI32x4GtS:
2727				result = []bool{
2728					int32(x1Lo>>0) > int32(x2Lo>>0), int32(x1Lo>>32) > int32(x2Lo>>32),
2729					int32(x1Hi>>0) > int32(x2Hi>>0), int32(x1Hi>>32) > int32(x2Hi>>32),
2730				}
2731			case v128CmpTypeI32x4GtU:
2732				result = []bool{
2733					uint32(x1Lo>>0) > uint32(x2Lo>>0), uint32(x1Lo>>32) > uint32(x2Lo>>32),
2734					uint32(x1Hi>>0) > uint32(x2Hi>>0), uint32(x1Hi>>32) > uint32(x2Hi>>32),
2735				}
2736			case v128CmpTypeI32x4LeS:
2737				result = []bool{
2738					int32(x1Lo>>0) <= int32(x2Lo>>0), int32(x1Lo>>32) <= int32(x2Lo>>32),
2739					int32(x1Hi>>0) <= int32(x2Hi>>0), int32(x1Hi>>32) <= int32(x2Hi>>32),
2740				}
2741			case v128CmpTypeI32x4LeU:
2742				result = []bool{
2743					uint32(x1Lo>>0) <= uint32(x2Lo>>0), uint32(x1Lo>>32) <= uint32(x2Lo>>32),
2744					uint32(x1Hi>>0) <= uint32(x2Hi>>0), uint32(x1Hi>>32) <= uint32(x2Hi>>32),
2745				}
2746			case v128CmpTypeI32x4GeS:
2747				result = []bool{
2748					int32(x1Lo>>0) >= int32(x2Lo>>0), int32(x1Lo>>32) >= int32(x2Lo>>32),
2749					int32(x1Hi>>0) >= int32(x2Hi>>0), int32(x1Hi>>32) >= int32(x2Hi>>32),
2750				}
2751			case v128CmpTypeI32x4GeU:
2752				result = []bool{
2753					uint32(x1Lo>>0) >= uint32(x2Lo>>0), uint32(x1Lo>>32) >= uint32(x2Lo>>32),
2754					uint32(x1Hi>>0) >= uint32(x2Hi>>0), uint32(x1Hi>>32) >= uint32(x2Hi>>32),
2755				}
2756			case v128CmpTypeI64x2Eq:
2757				result = []bool{x1Lo == x2Lo, x1Hi == x2Hi}
2758			case v128CmpTypeI64x2Ne:
2759				result = []bool{x1Lo != x2Lo, x1Hi != x2Hi}
2760			case v128CmpTypeI64x2LtS:
2761				result = []bool{int64(x1Lo) < int64(x2Lo), int64(x1Hi) < int64(x2Hi)}
2762			case v128CmpTypeI64x2GtS:
2763				result = []bool{int64(x1Lo) > int64(x2Lo), int64(x1Hi) > int64(x2Hi)}
2764			case v128CmpTypeI64x2LeS:
2765				result = []bool{int64(x1Lo) <= int64(x2Lo), int64(x1Hi) <= int64(x2Hi)}
2766			case v128CmpTypeI64x2GeS:
2767				result = []bool{int64(x1Lo) >= int64(x2Lo), int64(x1Hi) >= int64(x2Hi)}
2768			case v128CmpTypeF32x4Eq:
2769				result = []bool{
2770					math.Float32frombits(uint32(x1Lo>>0)) == math.Float32frombits(uint32(x2Lo>>0)),
2771					math.Float32frombits(uint32(x1Lo>>32)) == math.Float32frombits(uint32(x2Lo>>32)),
2772					math.Float32frombits(uint32(x1Hi>>0)) == math.Float32frombits(uint32(x2Hi>>0)),
2773					math.Float32frombits(uint32(x1Hi>>32)) == math.Float32frombits(uint32(x2Hi>>32)),
2774				}
2775			case v128CmpTypeF32x4Ne:
2776				result = []bool{
2777					math.Float32frombits(uint32(x1Lo>>0)) != math.Float32frombits(uint32(x2Lo>>0)),
2778					math.Float32frombits(uint32(x1Lo>>32)) != math.Float32frombits(uint32(x2Lo>>32)),
2779					math.Float32frombits(uint32(x1Hi>>0)) != math.Float32frombits(uint32(x2Hi>>0)),
2780					math.Float32frombits(uint32(x1Hi>>32)) != math.Float32frombits(uint32(x2Hi>>32)),
2781				}
2782			case v128CmpTypeF32x4Lt:
2783				result = []bool{
2784					math.Float32frombits(uint32(x1Lo>>0)) < math.Float32frombits(uint32(x2Lo>>0)),
2785					math.Float32frombits(uint32(x1Lo>>32)) < math.Float32frombits(uint32(x2Lo>>32)),
2786					math.Float32frombits(uint32(x1Hi>>0)) < math.Float32frombits(uint32(x2Hi>>0)),
2787					math.Float32frombits(uint32(x1Hi>>32)) < math.Float32frombits(uint32(x2Hi>>32)),
2788				}
2789			case v128CmpTypeF32x4Gt:
2790				result = []bool{
2791					math.Float32frombits(uint32(x1Lo>>0)) > math.Float32frombits(uint32(x2Lo>>0)),
2792					math.Float32frombits(uint32(x1Lo>>32)) > math.Float32frombits(uint32(x2Lo>>32)),
2793					math.Float32frombits(uint32(x1Hi>>0)) > math.Float32frombits(uint32(x2Hi>>0)),
2794					math.Float32frombits(uint32(x1Hi>>32)) > math.Float32frombits(uint32(x2Hi>>32)),
2795				}
2796			case v128CmpTypeF32x4Le:
2797				result = []bool{
2798					math.Float32frombits(uint32(x1Lo>>0)) <= math.Float32frombits(uint32(x2Lo>>0)),
2799					math.Float32frombits(uint32(x1Lo>>32)) <= math.Float32frombits(uint32(x2Lo>>32)),
2800					math.Float32frombits(uint32(x1Hi>>0)) <= math.Float32frombits(uint32(x2Hi>>0)),
2801					math.Float32frombits(uint32(x1Hi>>32)) <= math.Float32frombits(uint32(x2Hi>>32)),
2802				}
2803			case v128CmpTypeF32x4Ge:
2804				result = []bool{
2805					math.Float32frombits(uint32(x1Lo>>0)) >= math.Float32frombits(uint32(x2Lo>>0)),
2806					math.Float32frombits(uint32(x1Lo>>32)) >= math.Float32frombits(uint32(x2Lo>>32)),
2807					math.Float32frombits(uint32(x1Hi>>0)) >= math.Float32frombits(uint32(x2Hi>>0)),
2808					math.Float32frombits(uint32(x1Hi>>32)) >= math.Float32frombits(uint32(x2Hi>>32)),
2809				}
2810			case v128CmpTypeF64x2Eq:
2811				result = []bool{
2812					math.Float64frombits(x1Lo) == math.Float64frombits(x2Lo),
2813					math.Float64frombits(x1Hi) == math.Float64frombits(x2Hi),
2814				}
2815			case v128CmpTypeF64x2Ne:
2816				result = []bool{
2817					math.Float64frombits(x1Lo) != math.Float64frombits(x2Lo),
2818					math.Float64frombits(x1Hi) != math.Float64frombits(x2Hi),
2819				}
2820			case v128CmpTypeF64x2Lt:
2821				result = []bool{
2822					math.Float64frombits(x1Lo) < math.Float64frombits(x2Lo),
2823					math.Float64frombits(x1Hi) < math.Float64frombits(x2Hi),
2824				}
2825			case v128CmpTypeF64x2Gt:
2826				result = []bool{
2827					math.Float64frombits(x1Lo) > math.Float64frombits(x2Lo),
2828					math.Float64frombits(x1Hi) > math.Float64frombits(x2Hi),
2829				}
2830			case v128CmpTypeF64x2Le:
2831				result = []bool{
2832					math.Float64frombits(x1Lo) <= math.Float64frombits(x2Lo),
2833					math.Float64frombits(x1Hi) <= math.Float64frombits(x2Hi),
2834				}
2835			case v128CmpTypeF64x2Ge:
2836				result = []bool{
2837					math.Float64frombits(x1Lo) >= math.Float64frombits(x2Lo),
2838					math.Float64frombits(x1Hi) >= math.Float64frombits(x2Hi),
2839				}
2840			}
2841
2842			var retLo, retHi uint64
2843			laneNum := len(result)
2844			switch laneNum {
2845			case 16:
2846				for i, b := range result {
2847					if b {
2848						if i < 8 {
2849							retLo |= 0xff << (i * 8)
2850						} else {
2851							retHi |= 0xff << ((i - 8) * 8)
2852						}
2853					}
2854				}
2855			case 8:
2856				for i, b := range result {
2857					if b {
2858						if i < 4 {
2859							retLo |= 0xffff << (i * 16)
2860						} else {
2861							retHi |= 0xffff << ((i - 4) * 16)
2862						}
2863					}
2864				}
2865			case 4:
2866				for i, b := range result {
2867					if b {
2868						if i < 2 {
2869							retLo |= 0xffff_ffff << (i * 32)
2870						} else {
2871							retHi |= 0xffff_ffff << ((i - 2) * 32)
2872						}
2873					}
2874				}
2875			case 2:
2876				if result[0] {
2877					retLo = ^uint64(0)
2878				}
2879				if result[1] {
2880					retHi = ^uint64(0)
2881				}
2882			}
2883
2884			ce.pushValue(retLo)
2885			ce.pushValue(retHi)
2886			frame.pc++
2887		case operationKindV128AddSat:
2888			x2hi, x2Lo := ce.popValue(), ce.popValue()
2889			x1hi, x1Lo := ce.popValue(), ce.popValue()
2890
2891			var retLo, retHi uint64
2892
2893			// Lane-wise addition while saturating the overflowing values.
2894			// https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#saturating-integer-addition
2895			switch op.B1 {
2896			case shapeI8x16:
2897				for i := 0; i < 16; i++ {
2898					var v, w byte
2899					if i < 8 {
2900						v, w = byte(x1Lo>>(i*8)), byte(x2Lo>>(i*8))
2901					} else {
2902						v, w = byte(x1hi>>((i-8)*8)), byte(x2hi>>((i-8)*8))
2903					}
2904
2905					var uv uint64
2906					if op.B3 { // signed
2907						if subbed := int64(int8(v)) + int64(int8(w)); subbed < math.MinInt8 {
2908							uv = uint64(byte(0x80))
2909						} else if subbed > math.MaxInt8 {
2910							uv = uint64(byte(0x7f))
2911						} else {
2912							uv = uint64(byte(int8(subbed)))
2913						}
2914					} else {
2915						if subbed := int64(v) + int64(w); subbed < 0 {
2916							uv = uint64(byte(0))
2917						} else if subbed > math.MaxUint8 {
2918							uv = uint64(byte(0xff))
2919						} else {
2920							uv = uint64(byte(subbed))
2921						}
2922					}
2923
2924					if i < 8 { // first 8 lanes are on lower 64bits.
2925						retLo |= uv << (i * 8)
2926					} else {
2927						retHi |= uv << ((i - 8) * 8)
2928					}
2929				}
2930			case shapeI16x8:
2931				for i := 0; i < 8; i++ {
2932					var v, w uint16
2933					if i < 4 {
2934						v, w = uint16(x1Lo>>(i*16)), uint16(x2Lo>>(i*16))
2935					} else {
2936						v, w = uint16(x1hi>>((i-4)*16)), uint16(x2hi>>((i-4)*16))
2937					}
2938
2939					var uv uint64
2940					if op.B3 { // signed
2941						if added := int64(int16(v)) + int64(int16(w)); added < math.MinInt16 {
2942							uv = uint64(uint16(0x8000))
2943						} else if added > math.MaxInt16 {
2944							uv = uint64(uint16(0x7fff))
2945						} else {
2946							uv = uint64(uint16(int16(added)))
2947						}
2948					} else {
2949						if added := int64(v) + int64(w); added < 0 {
2950							uv = uint64(uint16(0))
2951						} else if added > math.MaxUint16 {
2952							uv = uint64(uint16(0xffff))
2953						} else {
2954							uv = uint64(uint16(added))
2955						}
2956					}
2957
2958					if i < 4 { // first 4 lanes are on lower 64bits.
2959						retLo |= uv << (i * 16)
2960					} else {
2961						retHi |= uv << ((i - 4) * 16)
2962					}
2963				}
2964			}
2965
2966			ce.pushValue(retLo)
2967			ce.pushValue(retHi)
2968			frame.pc++
2969		case operationKindV128SubSat:
2970			x2hi, x2Lo := ce.popValue(), ce.popValue()
2971			x1hi, x1Lo := ce.popValue(), ce.popValue()
2972
2973			var retLo, retHi uint64
2974
2975			// Lane-wise subtraction while saturating the overflowing values.
2976			// https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#saturating-integer-subtraction
2977			switch op.B1 {
2978			case shapeI8x16:
2979				for i := 0; i < 16; i++ {
2980					var v, w byte
2981					if i < 8 {
2982						v, w = byte(x1Lo>>(i*8)), byte(x2Lo>>(i*8))
2983					} else {
2984						v, w = byte(x1hi>>((i-8)*8)), byte(x2hi>>((i-8)*8))
2985					}
2986
2987					var uv uint64
2988					if op.B3 { // signed
2989						if subbed := int64(int8(v)) - int64(int8(w)); subbed < math.MinInt8 {
2990							uv = uint64(byte(0x80))
2991						} else if subbed > math.MaxInt8 {
2992							uv = uint64(byte(0x7f))
2993						} else {
2994							uv = uint64(byte(int8(subbed)))
2995						}
2996					} else {
2997						if subbed := int64(v) - int64(w); subbed < 0 {
2998							uv = uint64(byte(0))
2999						} else if subbed > math.MaxUint8 {
3000							uv = uint64(byte(0xff))
3001						} else {
3002							uv = uint64(byte(subbed))
3003						}
3004					}
3005
3006					if i < 8 {
3007						retLo |= uv << (i * 8)
3008					} else {
3009						retHi |= uv << ((i - 8) * 8)
3010					}
3011				}
3012			case shapeI16x8:
3013				for i := 0; i < 8; i++ {
3014					var v, w uint16
3015					if i < 4 {
3016						v, w = uint16(x1Lo>>(i*16)), uint16(x2Lo>>(i*16))
3017					} else {
3018						v, w = uint16(x1hi>>((i-4)*16)), uint16(x2hi>>((i-4)*16))
3019					}
3020
3021					var uv uint64
3022					if op.B3 { // signed
3023						if subbed := int64(int16(v)) - int64(int16(w)); subbed < math.MinInt16 {
3024							uv = uint64(uint16(0x8000))
3025						} else if subbed > math.MaxInt16 {
3026							uv = uint64(uint16(0x7fff))
3027						} else {
3028							uv = uint64(uint16(int16(subbed)))
3029						}
3030					} else {
3031						if subbed := int64(v) - int64(w); subbed < 0 {
3032							uv = uint64(uint16(0))
3033						} else if subbed > math.MaxUint16 {
3034							uv = uint64(uint16(0xffff))
3035						} else {
3036							uv = uint64(uint16(subbed))
3037						}
3038					}
3039
3040					if i < 4 {
3041						retLo |= uv << (i * 16)
3042					} else {
3043						retHi |= uv << ((i - 4) * 16)
3044					}
3045				}
3046			}
3047
3048			ce.pushValue(retLo)
3049			ce.pushValue(retHi)
3050			frame.pc++
3051		case operationKindV128Mul:
3052			x2hi, x2lo := ce.popValue(), ce.popValue()
3053			x1hi, x1lo := ce.popValue(), ce.popValue()
3054			var retLo, retHi uint64
3055			switch op.B1 {
3056			case shapeI16x8:
3057				retHi = uint64(uint16(x1hi)*uint16(x2hi)) | (uint64(uint16(x1hi>>16)*uint16(x2hi>>16)) << 16) |
3058					(uint64(uint16(x1hi>>32)*uint16(x2hi>>32)) << 32) | (uint64(uint16(x1hi>>48)*uint16(x2hi>>48)) << 48)
3059				retLo = uint64(uint16(x1lo)*uint16(x2lo)) | (uint64(uint16(x1lo>>16)*uint16(x2lo>>16)) << 16) |
3060					(uint64(uint16(x1lo>>32)*uint16(x2lo>>32)) << 32) | (uint64(uint16(x1lo>>48)*uint16(x2lo>>48)) << 48)
3061			case shapeI32x4:
3062				retHi = uint64(uint32(x1hi)*uint32(x2hi)) | (uint64(uint32(x1hi>>32)*uint32(x2hi>>32)) << 32)
3063				retLo = uint64(uint32(x1lo)*uint32(x2lo)) | (uint64(uint32(x1lo>>32)*uint32(x2lo>>32)) << 32)
3064			case shapeI64x2:
3065				retHi = x1hi * x2hi
3066				retLo = x1lo * x2lo
3067			case shapeF32x4:
3068				retHi = mulFloat32bits(uint32(x1hi), uint32(x2hi)) | mulFloat32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32
3069				retLo = mulFloat32bits(uint32(x1lo), uint32(x2lo)) | mulFloat32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32
3070			case shapeF64x2:
3071				retHi = math.Float64bits(math.Float64frombits(x1hi) * math.Float64frombits(x2hi))
3072				retLo = math.Float64bits(math.Float64frombits(x1lo) * math.Float64frombits(x2lo))
3073			}
3074			ce.pushValue(retLo)
3075			ce.pushValue(retHi)
3076			frame.pc++
3077		case operationKindV128Div:
3078			x2hi, x2lo := ce.popValue(), ce.popValue()
3079			x1hi, x1lo := ce.popValue(), ce.popValue()
3080			var retLo, retHi uint64
3081			if op.B1 == shapeF64x2 {
3082				retHi = math.Float64bits(math.Float64frombits(x1hi) / math.Float64frombits(x2hi))
3083				retLo = math.Float64bits(math.Float64frombits(x1lo) / math.Float64frombits(x2lo))
3084			} else {
3085				retHi = divFloat32bits(uint32(x1hi), uint32(x2hi)) | divFloat32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32
3086				retLo = divFloat32bits(uint32(x1lo), uint32(x2lo)) | divFloat32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32
3087			}
3088			ce.pushValue(retLo)
3089			ce.pushValue(retHi)
3090			frame.pc++
3091		case operationKindV128Neg:
3092			hi, lo := ce.popValue(), ce.popValue()
3093			switch op.B1 {
3094			case shapeI8x16:
3095				lo = uint64(-byte(lo)) | (uint64(-byte(lo>>8)) << 8) |
3096					(uint64(-byte(lo>>16)) << 16) | (uint64(-byte(lo>>24)) << 24) |
3097					(uint64(-byte(lo>>32)) << 32) | (uint64(-byte(lo>>40)) << 40) |
3098					(uint64(-byte(lo>>48)) << 48) | (uint64(-byte(lo>>56)) << 56)
3099				hi = uint64(-byte(hi)) | (uint64(-byte(hi>>8)) << 8) |
3100					(uint64(-byte(hi>>16)) << 16) | (uint64(-byte(hi>>24)) << 24) |
3101					(uint64(-byte(hi>>32)) << 32) | (uint64(-byte(hi>>40)) << 40) |
3102					(uint64(-byte(hi>>48)) << 48) | (uint64(-byte(hi>>56)) << 56)
3103			case shapeI16x8:
3104				hi = uint64(-uint16(hi)) | (uint64(-uint16(hi>>16)) << 16) |
3105					(uint64(-uint16(hi>>32)) << 32) | (uint64(-uint16(hi>>48)) << 48)
3106				lo = uint64(-uint16(lo)) | (uint64(-uint16(lo>>16)) << 16) |
3107					(uint64(-uint16(lo>>32)) << 32) | (uint64(-uint16(lo>>48)) << 48)
3108			case shapeI32x4:
3109				hi = uint64(-uint32(hi)) | (uint64(-uint32(hi>>32)) << 32)
3110				lo = uint64(-uint32(lo)) | (uint64(-uint32(lo>>32)) << 32)
3111			case shapeI64x2:
3112				hi = -hi
3113				lo = -lo
3114			case shapeF32x4:
3115				hi = uint64(math.Float32bits(-math.Float32frombits(uint32(hi)))) |
3116					(uint64(math.Float32bits(-math.Float32frombits(uint32(hi>>32)))) << 32)
3117				lo = uint64(math.Float32bits(-math.Float32frombits(uint32(lo)))) |
3118					(uint64(math.Float32bits(-math.Float32frombits(uint32(lo>>32)))) << 32)
3119			case shapeF64x2:
3120				hi = math.Float64bits(-math.Float64frombits(hi))
3121				lo = math.Float64bits(-math.Float64frombits(lo))
3122			}
3123			ce.pushValue(lo)
3124			ce.pushValue(hi)
3125			frame.pc++
3126		case operationKindV128Sqrt:
3127			hi, lo := ce.popValue(), ce.popValue()
3128			if op.B1 == shapeF64x2 {
3129				hi = math.Float64bits(math.Sqrt(math.Float64frombits(hi)))
3130				lo = math.Float64bits(math.Sqrt(math.Float64frombits(lo)))
3131			} else {
3132				hi = uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(hi))))))) |
3133					(uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(hi>>32))))))) << 32)
3134				lo = uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(lo))))))) |
3135					(uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(lo>>32))))))) << 32)
3136			}
3137			ce.pushValue(lo)
3138			ce.pushValue(hi)
3139			frame.pc++
3140		case operationKindV128Abs:
3141			hi, lo := ce.popValue(), ce.popValue()
3142			switch op.B1 {
3143			case shapeI8x16:
3144				lo = uint64(i8Abs(byte(lo))) | (uint64(i8Abs(byte(lo>>8))) << 8) |
3145					(uint64(i8Abs(byte(lo>>16))) << 16) | (uint64(i8Abs(byte(lo>>24))) << 24) |
3146					(uint64(i8Abs(byte(lo>>32))) << 32) | (uint64(i8Abs(byte(lo>>40))) << 40) |
3147					(uint64(i8Abs(byte(lo>>48))) << 48) | (uint64(i8Abs(byte(lo>>56))) << 56)
3148				hi = uint64(i8Abs(byte(hi))) | (uint64(i8Abs(byte(hi>>8))) << 8) |
3149					(uint64(i8Abs(byte(hi>>16))) << 16) | (uint64(i8Abs(byte(hi>>24))) << 24) |
3150					(uint64(i8Abs(byte(hi>>32))) << 32) | (uint64(i8Abs(byte(hi>>40))) << 40) |
3151					(uint64(i8Abs(byte(hi>>48))) << 48) | (uint64(i8Abs(byte(hi>>56))) << 56)
3152			case shapeI16x8:
3153				hi = uint64(i16Abs(uint16(hi))) | (uint64(i16Abs(uint16(hi>>16))) << 16) |
3154					(uint64(i16Abs(uint16(hi>>32))) << 32) | (uint64(i16Abs(uint16(hi>>48))) << 48)
3155				lo = uint64(i16Abs(uint16(lo))) | (uint64(i16Abs(uint16(lo>>16))) << 16) |
3156					(uint64(i16Abs(uint16(lo>>32))) << 32) | (uint64(i16Abs(uint16(lo>>48))) << 48)
3157			case shapeI32x4:
3158				hi = uint64(i32Abs(uint32(hi))) | (uint64(i32Abs(uint32(hi>>32))) << 32)
3159				lo = uint64(i32Abs(uint32(lo))) | (uint64(i32Abs(uint32(lo>>32))) << 32)
3160			case shapeI64x2:
3161				if int64(hi) < 0 {
3162					hi = -hi
3163				}
3164				if int64(lo) < 0 {
3165					lo = -lo
3166				}
3167			case shapeF32x4:
3168				hi = hi &^ (1<<31 | 1<<63)
3169				lo = lo &^ (1<<31 | 1<<63)
3170			case shapeF64x2:
3171				hi = hi &^ (1 << 63)
3172				lo = lo &^ (1 << 63)
3173			}
3174			ce.pushValue(lo)
3175			ce.pushValue(hi)
3176			frame.pc++
3177		case operationKindV128Popcnt:
3178			hi, lo := ce.popValue(), ce.popValue()
3179			var retLo, retHi uint64
3180			for i := 0; i < 16; i++ {
3181				var v byte
3182				if i < 8 {
3183					v = byte(lo >> (i * 8))
3184				} else {
3185					v = byte(hi >> ((i - 8) * 8))
3186				}
3187
3188				var cnt uint64
3189				for i := 0; i < 8; i++ {
3190					if (v>>i)&0b1 != 0 {
3191						cnt++
3192					}
3193				}
3194
3195				if i < 8 {
3196					retLo |= cnt << (i * 8)
3197				} else {
3198					retHi |= cnt << ((i - 8) * 8)
3199				}
3200			}
3201			ce.pushValue(retLo)
3202			ce.pushValue(retHi)
3203			frame.pc++
3204		case operationKindV128Min:
3205			x2hi, x2lo := ce.popValue(), ce.popValue()
3206			x1hi, x1lo := ce.popValue(), ce.popValue()
3207			var retLo, retHi uint64
3208			switch op.B1 {
3209			case shapeI8x16:
3210				if op.B3 { // signed
3211					retLo = uint64(i8MinS(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MinS(uint8(x1lo), uint8(x2lo))) |
3212						uint64(i8MinS(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MinS(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 |
3213						uint64(i8MinS(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MinS(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 |
3214						uint64(i8MinS(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MinS(uint8(x1lo>>48), uint8(x2lo>>48)))<<48
3215					retHi = uint64(i8MinS(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MinS(uint8(x1hi), uint8(x2hi))) |
3216						uint64(i8MinS(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MinS(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 |
3217						uint64(i8MinS(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MinS(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 |
3218						uint64(i8MinS(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MinS(uint8(x1hi>>48), uint8(x2hi>>48)))<<48
3219				} else {
3220					retLo = uint64(i8MinU(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MinU(uint8(x1lo), uint8(x2lo))) |
3221						uint64(i8MinU(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MinU(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 |
3222						uint64(i8MinU(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MinU(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 |
3223						uint64(i8MinU(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MinU(uint8(x1lo>>48), uint8(x2lo>>48)))<<48
3224					retHi = uint64(i8MinU(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MinU(uint8(x1hi), uint8(x2hi))) |
3225						uint64(i8MinU(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MinU(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 |
3226						uint64(i8MinU(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MinU(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 |
3227						uint64(i8MinU(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MinU(uint8(x1hi>>48), uint8(x2hi>>48)))<<48
3228				}
3229			case shapeI16x8:
3230				if op.B3 { // signed
3231					retLo = uint64(i16MinS(uint16(x1lo), uint16(x2lo))) |
3232						uint64(i16MinS(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 |
3233						uint64(i16MinS(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 |
3234						uint64(i16MinS(uint16(x1lo>>48), uint16(x2lo>>48)))<<48
3235					retHi = uint64(i16MinS(uint16(x1hi), uint16(x2hi))) |
3236						uint64(i16MinS(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 |
3237						uint64(i16MinS(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 |
3238						uint64(i16MinS(uint16(x1hi>>48), uint16(x2hi>>48)))<<48
3239				} else {
3240					retLo = uint64(i16MinU(uint16(x1lo), uint16(x2lo))) |
3241						uint64(i16MinU(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 |
3242						uint64(i16MinU(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 |
3243						uint64(i16MinU(uint16(x1lo>>48), uint16(x2lo>>48)))<<48
3244					retHi = uint64(i16MinU(uint16(x1hi), uint16(x2hi))) |
3245						uint64(i16MinU(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 |
3246						uint64(i16MinU(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 |
3247						uint64(i16MinU(uint16(x1hi>>48), uint16(x2hi>>48)))<<48
3248				}
3249			case shapeI32x4:
3250				if op.B3 { // signed
3251					retLo = uint64(i32MinS(uint32(x1lo), uint32(x2lo))) |
3252						uint64(i32MinS(uint32(x1lo>>32), uint32(x2lo>>32)))<<32
3253					retHi = uint64(i32MinS(uint32(x1hi), uint32(x2hi))) |
3254						uint64(i32MinS(uint32(x1hi>>32), uint32(x2hi>>32)))<<32
3255				} else {
3256					retLo = uint64(i32MinU(uint32(x1lo), uint32(x2lo))) |
3257						uint64(i32MinU(uint32(x1lo>>32), uint32(x2lo>>32)))<<32
3258					retHi = uint64(i32MinU(uint32(x1hi), uint32(x2hi))) |
3259						uint64(i32MinU(uint32(x1hi>>32), uint32(x2hi>>32)))<<32
3260				}
3261			case shapeF32x4:
3262				retHi = wasmCompatMin32bits(uint32(x1hi), uint32(x2hi)) |
3263					wasmCompatMin32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32
3264				retLo = wasmCompatMin32bits(uint32(x1lo), uint32(x2lo)) |
3265					wasmCompatMin32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32
3266			case shapeF64x2:
3267				retHi = math.Float64bits(moremath.WasmCompatMin64(
3268					math.Float64frombits(x1hi),
3269					math.Float64frombits(x2hi),
3270				))
3271				retLo = math.Float64bits(moremath.WasmCompatMin64(
3272					math.Float64frombits(x1lo),
3273					math.Float64frombits(x2lo),
3274				))
3275			}
3276			ce.pushValue(retLo)
3277			ce.pushValue(retHi)
3278			frame.pc++
3279		case operationKindV128Max:
3280			x2hi, x2lo := ce.popValue(), ce.popValue()
3281			x1hi, x1lo := ce.popValue(), ce.popValue()
3282			var retLo, retHi uint64
3283			switch op.B1 {
3284			case shapeI8x16:
3285				if op.B3 { // signed
3286					retLo = uint64(i8MaxS(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MaxS(uint8(x1lo), uint8(x2lo))) |
3287						uint64(i8MaxS(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MaxS(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 |
3288						uint64(i8MaxS(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MaxS(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 |
3289						uint64(i8MaxS(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MaxS(uint8(x1lo>>48), uint8(x2lo>>48)))<<48
3290					retHi = uint64(i8MaxS(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MaxS(uint8(x1hi), uint8(x2hi))) |
3291						uint64(i8MaxS(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MaxS(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 |
3292						uint64(i8MaxS(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MaxS(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 |
3293						uint64(i8MaxS(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MaxS(uint8(x1hi>>48), uint8(x2hi>>48)))<<48
3294				} else {
3295					retLo = uint64(i8MaxU(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MaxU(uint8(x1lo), uint8(x2lo))) |
3296						uint64(i8MaxU(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MaxU(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 |
3297						uint64(i8MaxU(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MaxU(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 |
3298						uint64(i8MaxU(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MaxU(uint8(x1lo>>48), uint8(x2lo>>48)))<<48
3299					retHi = uint64(i8MaxU(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MaxU(uint8(x1hi), uint8(x2hi))) |
3300						uint64(i8MaxU(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MaxU(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 |
3301						uint64(i8MaxU(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MaxU(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 |
3302						uint64(i8MaxU(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MaxU(uint8(x1hi>>48), uint8(x2hi>>48)))<<48
3303				}
3304			case shapeI16x8:
3305				if op.B3 { // signed
3306					retLo = uint64(i16MaxS(uint16(x1lo), uint16(x2lo))) |
3307						uint64(i16MaxS(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 |
3308						uint64(i16MaxS(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 |
3309						uint64(i16MaxS(uint16(x1lo>>48), uint16(x2lo>>48)))<<48
3310					retHi = uint64(i16MaxS(uint16(x1hi), uint16(x2hi))) |
3311						uint64(i16MaxS(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 |
3312						uint64(i16MaxS(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 |
3313						uint64(i16MaxS(uint16(x1hi>>48), uint16(x2hi>>48)))<<48
3314				} else {
3315					retLo = uint64(i16MaxU(uint16(x1lo), uint16(x2lo))) |
3316						uint64(i16MaxU(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 |
3317						uint64(i16MaxU(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 |
3318						uint64(i16MaxU(uint16(x1lo>>48), uint16(x2lo>>48)))<<48
3319					retHi = uint64(i16MaxU(uint16(x1hi), uint16(x2hi))) |
3320						uint64(i16MaxU(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 |
3321						uint64(i16MaxU(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 |
3322						uint64(i16MaxU(uint16(x1hi>>48), uint16(x2hi>>48)))<<48
3323				}
3324			case shapeI32x4:
3325				if op.B3 { // signed
3326					retLo = uint64(i32MaxS(uint32(x1lo), uint32(x2lo))) |
3327						uint64(i32MaxS(uint32(x1lo>>32), uint32(x2lo>>32)))<<32
3328					retHi = uint64(i32MaxS(uint32(x1hi), uint32(x2hi))) |
3329						uint64(i32MaxS(uint32(x1hi>>32), uint32(x2hi>>32)))<<32
3330				} else {
3331					retLo = uint64(i32MaxU(uint32(x1lo), uint32(x2lo))) |
3332						uint64(i32MaxU(uint32(x1lo>>32), uint32(x2lo>>32)))<<32
3333					retHi = uint64(i32MaxU(uint32(x1hi), uint32(x2hi))) |
3334						uint64(i32MaxU(uint32(x1hi>>32), uint32(x2hi>>32)))<<32
3335				}
3336			case shapeF32x4:
3337				retHi = wasmCompatMax32bits(uint32(x1hi), uint32(x2hi)) |
3338					wasmCompatMax32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32
3339				retLo = wasmCompatMax32bits(uint32(x1lo), uint32(x2lo)) |
3340					wasmCompatMax32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32
3341			case shapeF64x2:
3342				retHi = math.Float64bits(moremath.WasmCompatMax64(
3343					math.Float64frombits(x1hi),
3344					math.Float64frombits(x2hi),
3345				))
3346				retLo = math.Float64bits(moremath.WasmCompatMax64(
3347					math.Float64frombits(x1lo),
3348					math.Float64frombits(x2lo),
3349				))
3350			}
3351			ce.pushValue(retLo)
3352			ce.pushValue(retHi)
3353			frame.pc++
3354		case operationKindV128AvgrU:
3355			x2hi, x2lo := ce.popValue(), ce.popValue()
3356			x1hi, x1lo := ce.popValue(), ce.popValue()
3357			var retLo, retHi uint64
3358			switch op.B1 {
3359			case shapeI8x16:
3360				retLo = uint64(i8RoundingAverage(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8RoundingAverage(uint8(x1lo), uint8(x2lo))) |
3361					uint64(i8RoundingAverage(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8RoundingAverage(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 |
3362					uint64(i8RoundingAverage(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8RoundingAverage(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 |
3363					uint64(i8RoundingAverage(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8RoundingAverage(uint8(x1lo>>48), uint8(x2lo>>48)))<<48
3364				retHi = uint64(i8RoundingAverage(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8RoundingAverage(uint8(x1hi), uint8(x2hi))) |
3365					uint64(i8RoundingAverage(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8RoundingAverage(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 |
3366					uint64(i8RoundingAverage(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8RoundingAverage(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 |
3367					uint64(i8RoundingAverage(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8RoundingAverage(uint8(x1hi>>48), uint8(x2hi>>48)))<<48
3368			case shapeI16x8:
3369				retLo = uint64(i16RoundingAverage(uint16(x1lo), uint16(x2lo))) |
3370					uint64(i16RoundingAverage(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 |
3371					uint64(i16RoundingAverage(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 |
3372					uint64(i16RoundingAverage(uint16(x1lo>>48), uint16(x2lo>>48)))<<48
3373				retHi = uint64(i16RoundingAverage(uint16(x1hi), uint16(x2hi))) |
3374					uint64(i16RoundingAverage(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 |
3375					uint64(i16RoundingAverage(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 |
3376					uint64(i16RoundingAverage(uint16(x1hi>>48), uint16(x2hi>>48)))<<48
3377			}
3378			ce.pushValue(retLo)
3379			ce.pushValue(retHi)
3380			frame.pc++
3381		case operationKindV128Pmin:
3382			x2hi, x2lo := ce.popValue(), ce.popValue()
3383			x1hi, x1lo := ce.popValue(), ce.popValue()
3384			var retLo, retHi uint64
3385			if op.B1 == shapeF32x4 {
3386				if flt32(math.Float32frombits(uint32(x2lo)), math.Float32frombits(uint32(x1lo))) {
3387					retLo = x2lo & 0x00000000_ffffffff
3388				} else {
3389					retLo = x1lo & 0x00000000_ffffffff
3390				}
3391				if flt32(math.Float32frombits(uint32(x2lo>>32)), math.Float32frombits(uint32(x1lo>>32))) {
3392					retLo |= x2lo & 0xffffffff_00000000
3393				} else {
3394					retLo |= x1lo & 0xffffffff_00000000
3395				}
3396				if flt32(math.Float32frombits(uint32(x2hi)), math.Float32frombits(uint32(x1hi))) {
3397					retHi = x2hi & 0x00000000_ffffffff
3398				} else {
3399					retHi = x1hi & 0x00000000_ffffffff
3400				}
3401				if flt32(math.Float32frombits(uint32(x2hi>>32)), math.Float32frombits(uint32(x1hi>>32))) {
3402					retHi |= x2hi & 0xffffffff_00000000
3403				} else {
3404					retHi |= x1hi & 0xffffffff_00000000
3405				}
3406			} else {
3407				if flt64(math.Float64frombits(x2lo), math.Float64frombits(x1lo)) {
3408					retLo = x2lo
3409				} else {
3410					retLo = x1lo
3411				}
3412				if flt64(math.Float64frombits(x2hi), math.Float64frombits(x1hi)) {
3413					retHi = x2hi
3414				} else {
3415					retHi = x1hi
3416				}
3417			}
3418			ce.pushValue(retLo)
3419			ce.pushValue(retHi)
3420			frame.pc++
3421		case operationKindV128Pmax:
3422			x2hi, x2lo := ce.popValue(), ce.popValue()
3423			x1hi, x1lo := ce.popValue(), ce.popValue()
3424			var retLo, retHi uint64
3425			if op.B1 == shapeF32x4 {
3426				if flt32(math.Float32frombits(uint32(x1lo)), math.Float32frombits(uint32(x2lo))) {
3427					retLo = x2lo & 0x00000000_ffffffff
3428				} else {
3429					retLo = x1lo & 0x00000000_ffffffff
3430				}
3431				if flt32(math.Float32frombits(uint32(x1lo>>32)), math.Float32frombits(uint32(x2lo>>32))) {
3432					retLo |= x2lo & 0xffffffff_00000000
3433				} else {
3434					retLo |= x1lo & 0xffffffff_00000000
3435				}
3436				if flt32(math.Float32frombits(uint32(x1hi)), math.Float32frombits(uint32(x2hi))) {
3437					retHi = x2hi & 0x00000000_ffffffff
3438				} else {
3439					retHi = x1hi & 0x00000000_ffffffff
3440				}
3441				if flt32(math.Float32frombits(uint32(x1hi>>32)), math.Float32frombits(uint32(x2hi>>32))) {
3442					retHi |= x2hi & 0xffffffff_00000000
3443				} else {
3444					retHi |= x1hi & 0xffffffff_00000000
3445				}
3446			} else {
3447				if flt64(math.Float64frombits(x1lo), math.Float64frombits(x2lo)) {
3448					retLo = x2lo
3449				} else {
3450					retLo = x1lo
3451				}
3452				if flt64(math.Float64frombits(x1hi), math.Float64frombits(x2hi)) {
3453					retHi = x2hi
3454				} else {
3455					retHi = x1hi
3456				}
3457			}
3458			ce.pushValue(retLo)
3459			ce.pushValue(retHi)
3460			frame.pc++
3461		case operationKindV128Ceil:
3462			hi, lo := ce.popValue(), ce.popValue()
3463			if op.B1 == shapeF32x4 {
3464				lo = uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(lo))))) |
3465					(uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(lo>>32))))) << 32)
3466				hi = uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(hi))))) |
3467					(uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(hi>>32))))) << 32)
3468			} else {
3469				lo = math.Float64bits(moremath.WasmCompatCeilF64(math.Float64frombits(lo)))
3470				hi = math.Float64bits(moremath.WasmCompatCeilF64(math.Float64frombits(hi)))
3471			}
3472			ce.pushValue(lo)
3473			ce.pushValue(hi)
3474			frame.pc++
3475		case operationKindV128Floor:
3476			hi, lo := ce.popValue(), ce.popValue()
3477			if op.B1 == shapeF32x4 {
3478				lo = uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(lo))))) |
3479					(uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(lo>>32))))) << 32)
3480				hi = uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(hi))))) |
3481					(uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(hi>>32))))) << 32)
3482			} else {
3483				lo = math.Float64bits(moremath.WasmCompatFloorF64(math.Float64frombits(lo)))
3484				hi = math.Float64bits(moremath.WasmCompatFloorF64(math.Float64frombits(hi)))
3485			}
3486			ce.pushValue(lo)
3487			ce.pushValue(hi)
3488			frame.pc++
3489		case operationKindV128Trunc:
3490			hi, lo := ce.popValue(), ce.popValue()
3491			if op.B1 == shapeF32x4 {
3492				lo = uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(lo))))) |
3493					(uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(lo>>32))))) << 32)
3494				hi = uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(hi))))) |
3495					(uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(hi>>32))))) << 32)
3496			} else {
3497				lo = math.Float64bits(moremath.WasmCompatTruncF64(math.Float64frombits(lo)))
3498				hi = math.Float64bits(moremath.WasmCompatTruncF64(math.Float64frombits(hi)))
3499			}
3500			ce.pushValue(lo)
3501			ce.pushValue(hi)
3502			frame.pc++
3503		case operationKindV128Nearest:
3504			hi, lo := ce.popValue(), ce.popValue()
3505			if op.B1 == shapeF32x4 {
3506				lo = uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(lo))))) |
3507					(uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(lo>>32))))) << 32)
3508				hi = uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(hi))))) |
3509					(uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(hi>>32))))) << 32)
3510			} else {
3511				lo = math.Float64bits(moremath.WasmCompatNearestF64(math.Float64frombits(lo)))
3512				hi = math.Float64bits(moremath.WasmCompatNearestF64(math.Float64frombits(hi)))
3513			}
3514			ce.pushValue(lo)
3515			ce.pushValue(hi)
3516			frame.pc++
3517		case operationKindV128Extend:
3518			hi, lo := ce.popValue(), ce.popValue()
3519			var origin uint64
3520			if op.B3 { // use lower 64 bits
3521				origin = lo
3522			} else {
3523				origin = hi
3524			}
3525
3526			signed := op.B2 == 1
3527
3528			var retHi, retLo uint64
3529			switch op.B1 {
3530			case shapeI8x16:
3531				for i := 0; i < 8; i++ {
3532					v8 := byte(origin >> (i * 8))
3533
3534					var v16 uint16
3535					if signed {
3536						v16 = uint16(int8(v8))
3537					} else {
3538						v16 = uint16(v8)
3539					}
3540
3541					if i < 4 {
3542						retLo |= uint64(v16) << (i * 16)
3543					} else {
3544						retHi |= uint64(v16) << ((i - 4) * 16)
3545					}
3546				}
3547			case shapeI16x8:
3548				for i := 0; i < 4; i++ {
3549					v16 := uint16(origin >> (i * 16))
3550
3551					var v32 uint32
3552					if signed {
3553						v32 = uint32(int16(v16))
3554					} else {
3555						v32 = uint32(v16)
3556					}
3557
3558					if i < 2 {
3559						retLo |= uint64(v32) << (i * 32)
3560					} else {
3561						retHi |= uint64(v32) << ((i - 2) * 32)
3562					}
3563				}
3564			case shapeI32x4:
3565				v32Lo := uint32(origin)
3566				v32Hi := uint32(origin >> 32)
3567				if signed {
3568					retLo = uint64(int32(v32Lo))
3569					retHi = uint64(int32(v32Hi))
3570				} else {
3571					retLo = uint64(v32Lo)
3572					retHi = uint64(v32Hi)
3573				}
3574			}
3575			ce.pushValue(retLo)
3576			ce.pushValue(retHi)
3577			frame.pc++
3578		case operationKindV128ExtMul:
3579			x2Hi, x2Lo := ce.popValue(), ce.popValue()
3580			x1Hi, x1Lo := ce.popValue(), ce.popValue()
3581			var x1, x2 uint64
3582			if op.B3 { // use lower 64 bits
3583				x1, x2 = x1Lo, x2Lo
3584			} else {
3585				x1, x2 = x1Hi, x2Hi
3586			}
3587
3588			signed := op.B2 == 1
3589
3590			var retLo, retHi uint64
3591			switch op.B1 {
3592			case shapeI8x16:
3593				for i := 0; i < 8; i++ {
3594					v1, v2 := byte(x1>>(i*8)), byte(x2>>(i*8))
3595
3596					var v16 uint16
3597					if signed {
3598						v16 = uint16(int16(int8(v1)) * int16(int8(v2)))
3599					} else {
3600						v16 = uint16(v1) * uint16(v2)
3601					}
3602
3603					if i < 4 {
3604						retLo |= uint64(v16) << (i * 16)
3605					} else {
3606						retHi |= uint64(v16) << ((i - 4) * 16)
3607					}
3608				}
3609			case shapeI16x8:
3610				for i := 0; i < 4; i++ {
3611					v1, v2 := uint16(x1>>(i*16)), uint16(x2>>(i*16))
3612
3613					var v32 uint32
3614					if signed {
3615						v32 = uint32(int32(int16(v1)) * int32(int16(v2)))
3616					} else {
3617						v32 = uint32(v1) * uint32(v2)
3618					}
3619
3620					if i < 2 {
3621						retLo |= uint64(v32) << (i * 32)
3622					} else {
3623						retHi |= uint64(v32) << ((i - 2) * 32)
3624					}
3625				}
3626			case shapeI32x4:
3627				v1Lo, v2Lo := uint32(x1), uint32(x2)
3628				v1Hi, v2Hi := uint32(x1>>32), uint32(x2>>32)
3629				if signed {
3630					retLo = uint64(int64(int32(v1Lo)) * int64(int32(v2Lo)))
3631					retHi = uint64(int64(int32(v1Hi)) * int64(int32(v2Hi)))
3632				} else {
3633					retLo = uint64(v1Lo) * uint64(v2Lo)
3634					retHi = uint64(v1Hi) * uint64(v2Hi)
3635				}
3636			}
3637
3638			ce.pushValue(retLo)
3639			ce.pushValue(retHi)
3640			frame.pc++
3641		case operationKindV128Q15mulrSatS:
3642			x2hi, x2Lo := ce.popValue(), ce.popValue()
3643			x1hi, x1Lo := ce.popValue(), ce.popValue()
3644			var retLo, retHi uint64
3645			for i := 0; i < 8; i++ {
3646				var v, w int16
3647				if i < 4 {
3648					v, w = int16(uint16(x1Lo>>(i*16))), int16(uint16(x2Lo>>(i*16)))
3649				} else {
3650					v, w = int16(uint16(x1hi>>((i-4)*16))), int16(uint16(x2hi>>((i-4)*16)))
3651				}
3652
3653				var uv uint64
3654				// https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#saturating-integer-q-format-rounding-multiplication
3655				if calc := ((int32(v) * int32(w)) + 0x4000) >> 15; calc < math.MinInt16 {
3656					uv = uint64(uint16(0x8000))
3657				} else if calc > math.MaxInt16 {
3658					uv = uint64(uint16(0x7fff))
3659				} else {
3660					uv = uint64(uint16(int16(calc)))
3661				}
3662
3663				if i < 4 {
3664					retLo |= uv << (i * 16)
3665				} else {
3666					retHi |= uv << ((i - 4) * 16)
3667				}
3668			}
3669
3670			ce.pushValue(retLo)
3671			ce.pushValue(retHi)
3672			frame.pc++
3673		case operationKindV128ExtAddPairwise:
3674			hi, lo := ce.popValue(), ce.popValue()
3675
3676			signed := op.B3
3677
3678			var retLo, retHi uint64
3679			switch op.B1 {
3680			case shapeI8x16:
3681				for i := 0; i < 8; i++ {
3682					var v1, v2 byte
3683					if i < 4 {
3684						v1, v2 = byte(lo>>((i*2)*8)), byte(lo>>((i*2+1)*8))
3685					} else {
3686						v1, v2 = byte(hi>>(((i-4)*2)*8)), byte(hi>>(((i-4)*2+1)*8))
3687					}
3688
3689					var v16 uint16
3690					if signed {
3691						v16 = uint16(int16(int8(v1)) + int16(int8(v2)))
3692					} else {
3693						v16 = uint16(v1) + uint16(v2)
3694					}
3695
3696					if i < 4 {
3697						retLo |= uint64(v16) << (i * 16)
3698					} else {
3699						retHi |= uint64(v16) << ((i - 4) * 16)
3700					}
3701				}
3702			case shapeI16x8:
3703				for i := 0; i < 4; i++ {
3704					var v1, v2 uint16
3705					if i < 2 {
3706						v1, v2 = uint16(lo>>((i*2)*16)), uint16(lo>>((i*2+1)*16))
3707					} else {
3708						v1, v2 = uint16(hi>>(((i-2)*2)*16)), uint16(hi>>(((i-2)*2+1)*16))
3709					}
3710
3711					var v32 uint32
3712					if signed {
3713						v32 = uint32(int32(int16(v1)) + int32(int16(v2)))
3714					} else {
3715						v32 = uint32(v1) + uint32(v2)
3716					}
3717
3718					if i < 2 {
3719						retLo |= uint64(v32) << (i * 32)
3720					} else {
3721						retHi |= uint64(v32) << ((i - 2) * 32)
3722					}
3723				}
3724			}
3725			ce.pushValue(retLo)
3726			ce.pushValue(retHi)
3727			frame.pc++
3728		case operationKindV128FloatPromote:
3729			_, toPromote := ce.popValue(), ce.popValue()
3730			ce.pushValue(math.Float64bits(float64(math.Float32frombits(uint32(toPromote)))))
3731			ce.pushValue(math.Float64bits(float64(math.Float32frombits(uint32(toPromote >> 32)))))
3732			frame.pc++
3733		case operationKindV128FloatDemote:
3734			hi, lo := ce.popValue(), ce.popValue()
3735			ce.pushValue(
3736				uint64(math.Float32bits(float32(math.Float64frombits(lo)))) |
3737					(uint64(math.Float32bits(float32(math.Float64frombits(hi)))) << 32),
3738			)
3739			ce.pushValue(0)
3740			frame.pc++
3741		case operationKindV128FConvertFromI:
3742			hi, lo := ce.popValue(), ce.popValue()
3743			v1, v2, v3, v4 := uint32(lo), uint32(lo>>32), uint32(hi), uint32(hi>>32)
3744			signed := op.B3
3745
3746			var retLo, retHi uint64
3747			switch op.B1 { // Destination shape.
3748			case shapeF32x4: // f32x4 from signed/unsigned i32x4
3749				if signed {
3750					retLo = uint64(math.Float32bits(float32(int32(v1)))) |
3751						(uint64(math.Float32bits(float32(int32(v2)))) << 32)
3752					retHi = uint64(math.Float32bits(float32(int32(v3)))) |
3753						(uint64(math.Float32bits(float32(int32(v4)))) << 32)
3754				} else {
3755					retLo = uint64(math.Float32bits(float32(v1))) |
3756						(uint64(math.Float32bits(float32(v2))) << 32)
3757					retHi = uint64(math.Float32bits(float32(v3))) |
3758						(uint64(math.Float32bits(float32(v4))) << 32)
3759				}
3760			case shapeF64x2: // f64x2 from signed/unsigned i32x4
3761				if signed {
3762					retLo, retHi = math.Float64bits(float64(int32(v1))), math.Float64bits(float64(int32(v2)))
3763				} else {
3764					retLo, retHi = math.Float64bits(float64(v1)), math.Float64bits(float64(v2))
3765				}
3766			}
3767
3768			ce.pushValue(retLo)
3769			ce.pushValue(retHi)
3770			frame.pc++
3771		case operationKindV128Narrow:
3772			x2Hi, x2Lo := ce.popValue(), ce.popValue()
3773			x1Hi, x1Lo := ce.popValue(), ce.popValue()
3774			signed := op.B3
3775
3776			var retLo, retHi uint64
3777			switch op.B1 {
3778			case shapeI16x8: // signed/unsigned i16x8 to i8x16
3779				for i := 0; i < 8; i++ {
3780					var v16 uint16
3781					if i < 4 {
3782						v16 = uint16(x1Lo >> (i * 16))
3783					} else {
3784						v16 = uint16(x1Hi >> ((i - 4) * 16))
3785					}
3786
3787					var v byte
3788					if signed {
3789						if s := int16(v16); s > math.MaxInt8 {
3790							v = math.MaxInt8
3791						} else if s < math.MinInt8 {
3792							s = math.MinInt8
3793							v = byte(s)
3794						} else {
3795							v = byte(v16)
3796						}
3797					} else {
3798						if s := int16(v16); s > math.MaxUint8 {
3799							v = math.MaxUint8
3800						} else if s < 0 {
3801							v = 0
3802						} else {
3803							v = byte(v16)
3804						}
3805					}
3806					retLo |= uint64(v) << (i * 8)
3807				}
3808				for i := 0; i < 8; i++ {
3809					var v16 uint16
3810					if i < 4 {
3811						v16 = uint16(x2Lo >> (i * 16))
3812					} else {
3813						v16 = uint16(x2Hi >> ((i - 4) * 16))
3814					}
3815
3816					var v byte
3817					if signed {
3818						if s := int16(v16); s > math.MaxInt8 {
3819							v = math.MaxInt8
3820						} else if s < math.MinInt8 {
3821							s = math.MinInt8
3822							v = byte(s)
3823						} else {
3824							v = byte(v16)
3825						}
3826					} else {
3827						if s := int16(v16); s > math.MaxUint8 {
3828							v = math.MaxUint8
3829						} else if s < 0 {
3830							v = 0
3831						} else {
3832							v = byte(v16)
3833						}
3834					}
3835					retHi |= uint64(v) << (i * 8)
3836				}
3837			case shapeI32x4: // signed/unsigned i32x4 to i16x8
3838				for i := 0; i < 4; i++ {
3839					var v32 uint32
3840					if i < 2 {
3841						v32 = uint32(x1Lo >> (i * 32))
3842					} else {
3843						v32 = uint32(x1Hi >> ((i - 2) * 32))
3844					}
3845
3846					var v uint16
3847					if signed {
3848						if s := int32(v32); s > math.MaxInt16 {
3849							v = math.MaxInt16
3850						} else if s < math.MinInt16 {
3851							s = math.MinInt16
3852							v = uint16(s)
3853						} else {
3854							v = uint16(v32)
3855						}
3856					} else {
3857						if s := int32(v32); s > math.MaxUint16 {
3858							v = math.MaxUint16
3859						} else if s < 0 {
3860							v = 0
3861						} else {
3862							v = uint16(v32)
3863						}
3864					}
3865					retLo |= uint64(v) << (i * 16)
3866				}
3867
3868				for i := 0; i < 4; i++ {
3869					var v32 uint32
3870					if i < 2 {
3871						v32 = uint32(x2Lo >> (i * 32))
3872					} else {
3873						v32 = uint32(x2Hi >> ((i - 2) * 32))
3874					}
3875
3876					var v uint16
3877					if signed {
3878						if s := int32(v32); s > math.MaxInt16 {
3879							v = math.MaxInt16
3880						} else if s < math.MinInt16 {
3881							s = math.MinInt16
3882							v = uint16(s)
3883						} else {
3884							v = uint16(v32)
3885						}
3886					} else {
3887						if s := int32(v32); s > math.MaxUint16 {
3888							v = math.MaxUint16
3889						} else if s < 0 {
3890							v = 0
3891						} else {
3892							v = uint16(v32)
3893						}
3894					}
3895					retHi |= uint64(v) << (i * 16)
3896				}
3897			}
3898			ce.pushValue(retLo)
3899			ce.pushValue(retHi)
3900			frame.pc++
3901		case operationKindV128Dot:
3902			x2Hi, x2Lo := ce.popValue(), ce.popValue()
3903			x1Hi, x1Lo := ce.popValue(), ce.popValue()
3904			lo, hi := v128Dot(x1Hi, x1Lo, x2Hi, x2Lo)
3905			ce.pushValue(lo)
3906			ce.pushValue(hi)
3907			frame.pc++
3908		case operationKindV128ITruncSatFromF:
3909			hi, lo := ce.popValue(), ce.popValue()
3910			signed := op.B3
3911			var retLo, retHi uint64
3912
3913			switch op.B1 {
3914			case shapeF32x4: // f32x4 to i32x4
3915				for i, f64 := range [4]float64{
3916					math.Trunc(float64(math.Float32frombits(uint32(lo)))),
3917					math.Trunc(float64(math.Float32frombits(uint32(lo >> 32)))),
3918					math.Trunc(float64(math.Float32frombits(uint32(hi)))),
3919					math.Trunc(float64(math.Float32frombits(uint32(hi >> 32)))),
3920				} {
3921
3922					var v uint32
3923					if math.IsNaN(f64) {
3924						v = 0
3925					} else if signed {
3926						if f64 < math.MinInt32 {
3927							f64 = math.MinInt32
3928						} else if f64 > math.MaxInt32 {
3929							f64 = math.MaxInt32
3930						}
3931						v = uint32(int32(f64))
3932					} else {
3933						if f64 < 0 {
3934							f64 = 0
3935						} else if f64 > math.MaxUint32 {
3936							f64 = math.MaxUint32
3937						}
3938						v = uint32(f64)
3939					}
3940
3941					if i < 2 {
3942						retLo |= uint64(v) << (i * 32)
3943					} else {
3944						retHi |= uint64(v) << ((i - 2) * 32)
3945					}
3946				}
3947
3948			case shapeF64x2: // f64x2 to i32x4
3949				for i, f := range [2]float64{
3950					math.Trunc(math.Float64frombits(lo)),
3951					math.Trunc(math.Float64frombits(hi)),
3952				} {
3953					var v uint32
3954					if math.IsNaN(f) {
3955						v = 0
3956					} else if signed {
3957						if f < math.MinInt32 {
3958							f = math.MinInt32
3959						} else if f > math.MaxInt32 {
3960							f = math.MaxInt32
3961						}
3962						v = uint32(int32(f))
3963					} else {
3964						if f < 0 {
3965							f = 0
3966						} else if f > math.MaxUint32 {
3967							f = math.MaxUint32
3968						}
3969						v = uint32(f)
3970					}
3971
3972					retLo |= uint64(v) << (i * 32)
3973				}
3974			}
3975
3976			ce.pushValue(retLo)
3977			ce.pushValue(retHi)
3978			frame.pc++
3979		case operationKindAtomicMemoryWait:
3980			timeout := int64(ce.popValue())
3981			exp := ce.popValue()
3982			offset := ce.popMemoryOffset(op)
3983			// Runtime instead of validation error because the spec intends to allow binaries to include
3984			// such instructions as long as they are not executed.
3985			if !memoryInst.Shared {
3986				panic(wasmruntime.ErrRuntimeExpectedSharedMemory)
3987			}
3988
3989			switch unsignedType(op.B1) {
3990			case unsignedTypeI32:
3991				if offset%4 != 0 {
3992					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
3993				}
3994				if int(offset) > len(memoryInst.Buffer)-4 {
3995					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
3996				}
3997				ce.pushValue(memoryInst.Wait32(offset, uint32(exp), timeout, func(mem *wasm.MemoryInstance, offset uint32) uint32 {
3998					mem.Mux.Lock()
3999					defer mem.Mux.Unlock()
4000					value, _ := mem.ReadUint32Le(offset)
4001					return value
4002				}))
4003			case unsignedTypeI64:
4004				if offset%8 != 0 {
4005					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4006				}
4007				if int(offset) > len(memoryInst.Buffer)-8 {
4008					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4009				}
4010				ce.pushValue(memoryInst.Wait64(offset, exp, timeout, func(mem *wasm.MemoryInstance, offset uint32) uint64 {
4011					mem.Mux.Lock()
4012					defer mem.Mux.Unlock()
4013					value, _ := mem.ReadUint64Le(offset)
4014					return value
4015				}))
4016			}
4017			frame.pc++
4018		case operationKindAtomicMemoryNotify:
4019			count := ce.popValue()
4020			offset := ce.popMemoryOffset(op)
4021			if offset%4 != 0 {
4022				panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4023			}
4024			// Just a bounds check
4025			if offset >= memoryInst.Size() {
4026				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4027			}
4028			res := memoryInst.Notify(offset, uint32(count))
4029			ce.pushValue(uint64(res))
4030			frame.pc++
4031		case operationKindAtomicFence:
4032			// Memory not required for fence only
4033			if memoryInst != nil {
4034				// An empty critical section can be used as a synchronization primitive, which is what
4035				// fence is. Probably, there are no spectests or defined behavior to confirm this yet.
4036				memoryInst.Mux.Lock()
4037				memoryInst.Mux.Unlock() //nolint:staticcheck
4038			}
4039			frame.pc++
4040		case operationKindAtomicLoad:
4041			offset := ce.popMemoryOffset(op)
4042			switch unsignedType(op.B1) {
4043			case unsignedTypeI32:
4044				if offset%4 != 0 {
4045					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4046				}
4047				memoryInst.Mux.Lock()
4048				val, ok := memoryInst.ReadUint32Le(offset)
4049				memoryInst.Mux.Unlock()
4050				if !ok {
4051					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4052				}
4053				ce.pushValue(uint64(val))
4054			case unsignedTypeI64:
4055				if offset%8 != 0 {
4056					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4057				}
4058				memoryInst.Mux.Lock()
4059				val, ok := memoryInst.ReadUint64Le(offset)
4060				memoryInst.Mux.Unlock()
4061				if !ok {
4062					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4063				}
4064				ce.pushValue(val)
4065			}
4066			frame.pc++
4067		case operationKindAtomicLoad8:
4068			offset := ce.popMemoryOffset(op)
4069			memoryInst.Mux.Lock()
4070			val, ok := memoryInst.ReadByte(offset)
4071			memoryInst.Mux.Unlock()
4072			if !ok {
4073				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4074			}
4075			ce.pushValue(uint64(val))
4076			frame.pc++
4077		case operationKindAtomicLoad16:
4078			offset := ce.popMemoryOffset(op)
4079			if offset%2 != 0 {
4080				panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4081			}
4082			memoryInst.Mux.Lock()
4083			val, ok := memoryInst.ReadUint16Le(offset)
4084			memoryInst.Mux.Unlock()
4085			if !ok {
4086				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4087			}
4088			ce.pushValue(uint64(val))
4089			frame.pc++
4090		case operationKindAtomicStore:
4091			val := ce.popValue()
4092			offset := ce.popMemoryOffset(op)
4093			switch unsignedType(op.B1) {
4094			case unsignedTypeI32:
4095				if offset%4 != 0 {
4096					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4097				}
4098				memoryInst.Mux.Lock()
4099				ok := memoryInst.WriteUint32Le(offset, uint32(val))
4100				memoryInst.Mux.Unlock()
4101				if !ok {
4102					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4103				}
4104			case unsignedTypeI64:
4105				if offset%8 != 0 {
4106					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4107				}
4108				memoryInst.Mux.Lock()
4109				ok := memoryInst.WriteUint64Le(offset, val)
4110				memoryInst.Mux.Unlock()
4111				if !ok {
4112					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4113				}
4114			}
4115			frame.pc++
4116		case operationKindAtomicStore8:
4117			val := byte(ce.popValue())
4118			offset := ce.popMemoryOffset(op)
4119			memoryInst.Mux.Lock()
4120			ok := memoryInst.WriteByte(offset, val)
4121			memoryInst.Mux.Unlock()
4122			if !ok {
4123				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4124			}
4125			frame.pc++
4126		case operationKindAtomicStore16:
4127			val := uint16(ce.popValue())
4128			offset := ce.popMemoryOffset(op)
4129			if offset%2 != 0 {
4130				panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4131			}
4132			memoryInst.Mux.Lock()
4133			ok := memoryInst.WriteUint16Le(offset, val)
4134			memoryInst.Mux.Unlock()
4135			if !ok {
4136				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4137			}
4138			frame.pc++
4139		case operationKindAtomicRMW:
4140			val := ce.popValue()
4141			offset := ce.popMemoryOffset(op)
4142			switch unsignedType(op.B1) {
4143			case unsignedTypeI32:
4144				if offset%4 != 0 {
4145					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4146				}
4147				memoryInst.Mux.Lock()
4148				old, ok := memoryInst.ReadUint32Le(offset)
4149				if !ok {
4150					memoryInst.Mux.Unlock()
4151					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4152				}
4153				var newVal uint32
4154				switch atomicArithmeticOp(op.B2) {
4155				case atomicArithmeticOpAdd:
4156					newVal = old + uint32(val)
4157				case atomicArithmeticOpSub:
4158					newVal = old - uint32(val)
4159				case atomicArithmeticOpAnd:
4160					newVal = old & uint32(val)
4161				case atomicArithmeticOpOr:
4162					newVal = old | uint32(val)
4163				case atomicArithmeticOpXor:
4164					newVal = old ^ uint32(val)
4165				case atomicArithmeticOpNop:
4166					newVal = uint32(val)
4167				}
4168				memoryInst.WriteUint32Le(offset, newVal)
4169				memoryInst.Mux.Unlock()
4170				ce.pushValue(uint64(old))
4171			case unsignedTypeI64:
4172				if offset%8 != 0 {
4173					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4174				}
4175				memoryInst.Mux.Lock()
4176				old, ok := memoryInst.ReadUint64Le(offset)
4177				if !ok {
4178					memoryInst.Mux.Unlock()
4179					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4180				}
4181				var newVal uint64
4182				switch atomicArithmeticOp(op.B2) {
4183				case atomicArithmeticOpAdd:
4184					newVal = old + val
4185				case atomicArithmeticOpSub:
4186					newVal = old - val
4187				case atomicArithmeticOpAnd:
4188					newVal = old & val
4189				case atomicArithmeticOpOr:
4190					newVal = old | val
4191				case atomicArithmeticOpXor:
4192					newVal = old ^ val
4193				case atomicArithmeticOpNop:
4194					newVal = val
4195				}
4196				memoryInst.WriteUint64Le(offset, newVal)
4197				memoryInst.Mux.Unlock()
4198				ce.pushValue(old)
4199			}
4200			frame.pc++
4201		case operationKindAtomicRMW8:
4202			val := ce.popValue()
4203			offset := ce.popMemoryOffset(op)
4204			memoryInst.Mux.Lock()
4205			old, ok := memoryInst.ReadByte(offset)
4206			if !ok {
4207				memoryInst.Mux.Unlock()
4208				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4209			}
4210			arg := byte(val)
4211			var newVal byte
4212			switch atomicArithmeticOp(op.B2) {
4213			case atomicArithmeticOpAdd:
4214				newVal = old + arg
4215			case atomicArithmeticOpSub:
4216				newVal = old - arg
4217			case atomicArithmeticOpAnd:
4218				newVal = old & arg
4219			case atomicArithmeticOpOr:
4220				newVal = old | arg
4221			case atomicArithmeticOpXor:
4222				newVal = old ^ arg
4223			case atomicArithmeticOpNop:
4224				newVal = arg
4225			}
4226			memoryInst.WriteByte(offset, newVal)
4227			memoryInst.Mux.Unlock()
4228			ce.pushValue(uint64(old))
4229			frame.pc++
4230		case operationKindAtomicRMW16:
4231			val := ce.popValue()
4232			offset := ce.popMemoryOffset(op)
4233			if offset%2 != 0 {
4234				panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4235			}
4236			memoryInst.Mux.Lock()
4237			old, ok := memoryInst.ReadUint16Le(offset)
4238			if !ok {
4239				memoryInst.Mux.Unlock()
4240				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4241			}
4242			arg := uint16(val)
4243			var newVal uint16
4244			switch atomicArithmeticOp(op.B2) {
4245			case atomicArithmeticOpAdd:
4246				newVal = old + arg
4247			case atomicArithmeticOpSub:
4248				newVal = old - arg
4249			case atomicArithmeticOpAnd:
4250				newVal = old & arg
4251			case atomicArithmeticOpOr:
4252				newVal = old | arg
4253			case atomicArithmeticOpXor:
4254				newVal = old ^ arg
4255			case atomicArithmeticOpNop:
4256				newVal = arg
4257			}
4258			memoryInst.WriteUint16Le(offset, newVal)
4259			memoryInst.Mux.Unlock()
4260			ce.pushValue(uint64(old))
4261			frame.pc++
4262		case operationKindAtomicRMWCmpxchg:
4263			rep := ce.popValue()
4264			exp := ce.popValue()
4265			offset := ce.popMemoryOffset(op)
4266			switch unsignedType(op.B1) {
4267			case unsignedTypeI32:
4268				if offset%4 != 0 {
4269					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4270				}
4271				memoryInst.Mux.Lock()
4272				old, ok := memoryInst.ReadUint32Le(offset)
4273				if !ok {
4274					memoryInst.Mux.Unlock()
4275					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4276				}
4277				if old == uint32(exp) {
4278					memoryInst.WriteUint32Le(offset, uint32(rep))
4279				}
4280				memoryInst.Mux.Unlock()
4281				ce.pushValue(uint64(old))
4282			case unsignedTypeI64:
4283				if offset%8 != 0 {
4284					panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4285				}
4286				memoryInst.Mux.Lock()
4287				old, ok := memoryInst.ReadUint64Le(offset)
4288				if !ok {
4289					memoryInst.Mux.Unlock()
4290					panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4291				}
4292				if old == exp {
4293					memoryInst.WriteUint64Le(offset, rep)
4294				}
4295				memoryInst.Mux.Unlock()
4296				ce.pushValue(old)
4297			}
4298			frame.pc++
4299		case operationKindAtomicRMW8Cmpxchg:
4300			rep := byte(ce.popValue())
4301			exp := byte(ce.popValue())
4302			offset := ce.popMemoryOffset(op)
4303			memoryInst.Mux.Lock()
4304			old, ok := memoryInst.ReadByte(offset)
4305			if !ok {
4306				memoryInst.Mux.Unlock()
4307				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4308			}
4309			if old == exp {
4310				memoryInst.WriteByte(offset, rep)
4311			}
4312			memoryInst.Mux.Unlock()
4313			ce.pushValue(uint64(old))
4314			frame.pc++
4315		case operationKindAtomicRMW16Cmpxchg:
4316			rep := uint16(ce.popValue())
4317			exp := uint16(ce.popValue())
4318			offset := ce.popMemoryOffset(op)
4319			if offset%2 != 0 {
4320				panic(wasmruntime.ErrRuntimeUnalignedAtomic)
4321			}
4322			memoryInst.Mux.Lock()
4323			old, ok := memoryInst.ReadUint16Le(offset)
4324			if !ok {
4325				memoryInst.Mux.Unlock()
4326				panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4327			}
4328			if old == exp {
4329				memoryInst.WriteUint16Le(offset, rep)
4330			}
4331			memoryInst.Mux.Unlock()
4332			ce.pushValue(uint64(old))
4333			frame.pc++
4334		default:
4335			frame.pc++
4336		}
4337	}
4338	ce.popFrame()
4339}
4340
4341func wasmCompatMax32bits(v1, v2 uint32) uint64 {
4342	return uint64(math.Float32bits(moremath.WasmCompatMax32(
4343		math.Float32frombits(v1),
4344		math.Float32frombits(v2),
4345	)))
4346}
4347
4348func wasmCompatMin32bits(v1, v2 uint32) uint64 {
4349	return uint64(math.Float32bits(moremath.WasmCompatMin32(
4350		math.Float32frombits(v1),
4351		math.Float32frombits(v2),
4352	)))
4353}
4354
// addFloat32bits adds two float32 values supplied as raw IEEE 754 bit patterns and returns the bit
// pattern of the sum, zero-extended to 64 bits.
func addFloat32bits(v1, v2 uint32) uint64 {
	lhs, rhs := math.Float32frombits(v1), math.Float32frombits(v2)
	sum := lhs + rhs
	return uint64(math.Float32bits(sum))
}
4358
// subFloat32bits subtracts v2 from v1, both supplied as raw IEEE 754 single-precision bit patterns,
// and returns the bit pattern of the difference, zero-extended to 64 bits.
func subFloat32bits(v1, v2 uint32) uint64 {
	lhs, rhs := math.Float32frombits(v1), math.Float32frombits(v2)
	diff := lhs - rhs
	return uint64(math.Float32bits(diff))
}
4362
// mulFloat32bits multiplies two float32 values given as raw bit patterns and returns
// the bit pattern of the float32 product, zero-extended to 64 bits.
func mulFloat32bits(v1, v2 uint32) uint64 {
	lhs, rhs := math.Float32frombits(v1), math.Float32frombits(v2)
	product := lhs * rhs
	return uint64(math.Float32bits(product))
}
4366
// divFloat32bits divides v1 by v2, both given as raw float32 bit patterns, and
// returns the bit pattern of the float32 quotient, zero-extended to 64 bits.
func divFloat32bits(v1, v2 uint32) uint64 {
	dividend, divisor := math.Float32frombits(v1), math.Float32frombits(v2)
	quotient := dividend / divisor
	return uint64(math.Float32bits(quotient))
}
4370
// flt32 reports whether z1 is strictly less than z2. It returns false when either
// operand is NaN or when the operands compare equal, and resolves infinite operands
// explicitly before falling back to the ordinary comparison.
//
// See https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/exec/numerics.html#xref-exec-numerics-op-flt-mathrm-flt-n-z-1-z-2
func flt32(z1, z2 float32) bool {
	wide1, wide2 := float64(z1), float64(z2)
	// Case expressions are evaluated top-to-bottom and left-to-right, so the first
	// matching rule decides the result.
	switch {
	case math.IsNaN(wide1) || math.IsNaN(wide2), z1 == z2, math.IsInf(wide1, 1):
		return false
	case math.IsInf(wide1, -1), math.IsInf(wide2, 1):
		return true
	case math.IsInf(wide2, -1):
		return false
	default:
		return z1 < z2
	}
}
4388
// flt64 reports whether z1 is strictly less than z2. It returns false when either
// operand is NaN or when the operands compare equal, and resolves infinite operands
// explicitly before falling back to the ordinary comparison.
//
// See https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/exec/numerics.html#xref-exec-numerics-op-flt-mathrm-flt-n-z-1-z-2
func flt64(z1, z2 float64) bool {
	// Case expressions are evaluated top-to-bottom and left-to-right, so the first
	// matching rule decides the result.
	switch {
	case math.IsNaN(z1) || math.IsNaN(z2), z1 == z2, math.IsInf(z1, 1):
		return false
	case math.IsInf(z1, -1), math.IsInf(z2, 1):
		return true
	case math.IsInf(z2, -1):
		return false
	default:
		return z1 < z2
	}
}
4406
// i8RoundingAverage returns the unsigned average of v1 and v2, rounded up. The sum is
// computed in 16 bits so that it cannot wrap.
//
// See https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#lane-wise-integer-rounding-average
func i8RoundingAverage(v1, v2 byte) byte {
	total := uint16(v1) + uint16(v2) + 1
	return byte(total >> 1)
}
4411
// i16RoundingAverage returns the unsigned average of v1 and v2, rounded up. The sum is
// computed in 32 bits so that it cannot wrap.
//
// See https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#lane-wise-integer-rounding-average
func i16RoundingAverage(v1, v2 uint16) uint16 {
	total := uint32(v1) + uint32(v2) + 1
	return uint16(total >> 1)
}
4416
// i8Abs interprets v as a signed 8-bit integer and returns its absolute value as a
// byte. Negation wraps, so the minimum value (0x80) maps to itself.
func i8Abs(v byte) byte {
	signed := int8(v)
	if signed >= 0 {
		return v
	}
	return byte(-signed)
}
4424
// i8MaxU returns the larger of v1 and v2 compared as unsigned 8-bit integers.
func i8MaxU(v1, v2 byte) byte {
	if v2 > v1 {
		return v2
	}
	return v1
}
4432
// i8MinU returns the smaller of v1 and v2 compared as unsigned 8-bit integers.
func i8MinU(v1, v2 byte) byte {
	if v2 < v1 {
		return v2
	}
	return v1
}
4440
// i8MaxS returns whichever of v1 and v2 is larger when both are interpreted as
// signed 8-bit integers. The original bit pattern of the winner is returned.
func i8MaxS(v1, v2 byte) byte {
	larger := v1
	if int8(v1) < int8(v2) {
		larger = v2
	}
	return larger
}
4448
// i8MinS returns whichever of v1 and v2 is smaller when both are interpreted as
// signed 8-bit integers. The original bit pattern of the winner is returned.
func i8MinS(v1, v2 byte) byte {
	smaller := v1
	if int8(v1) > int8(v2) {
		smaller = v2
	}
	return smaller
}
4456
// i16MaxU returns the larger of v1 and v2 compared as unsigned 16-bit integers.
func i16MaxU(v1, v2 uint16) uint16 {
	if v2 > v1 {
		return v2
	}
	return v1
}
4464
// i16MinU returns the smaller of v1 and v2 compared as unsigned 16-bit integers.
func i16MinU(v1, v2 uint16) uint16 {
	if v2 < v1 {
		return v2
	}
	return v1
}
4472
// i16MaxS returns whichever of v1 and v2 is larger when both are interpreted as
// signed 16-bit integers. The original bit pattern of the winner is returned.
func i16MaxS(v1, v2 uint16) uint16 {
	larger := v1
	if int16(v1) < int16(v2) {
		larger = v2
	}
	return larger
}
4480
// i16MinS returns whichever of v1 and v2 is smaller when both are interpreted as
// signed 16-bit integers. The original bit pattern of the winner is returned.
func i16MinS(v1, v2 uint16) uint16 {
	smaller := v1
	if int16(v1) > int16(v2) {
		smaller = v2
	}
	return smaller
}
4488
// i32MaxU returns the larger of v1 and v2 compared as unsigned 32-bit integers.
func i32MaxU(v1, v2 uint32) uint32 {
	if v2 > v1 {
		return v2
	}
	return v1
}
4496
// i32MinU returns the smaller of v1 and v2 compared as unsigned 32-bit integers.
func i32MinU(v1, v2 uint32) uint32 {
	if v2 < v1 {
		return v2
	}
	return v1
}
4504
// i32MaxS returns whichever of v1 and v2 is larger when both are interpreted as
// signed 32-bit integers. The original bit pattern of the winner is returned.
func i32MaxS(v1, v2 uint32) uint32 {
	larger := v1
	if int32(v1) < int32(v2) {
		larger = v2
	}
	return larger
}
4512
// i32MinS returns whichever of v1 and v2 is smaller when both are interpreted as
// signed 32-bit integers. The original bit pattern of the winner is returned.
func i32MinS(v1, v2 uint32) uint32 {
	smaller := v1
	if int32(v1) > int32(v2) {
		smaller = v2
	}
	return smaller
}
4520
// i16Abs interprets v as a signed 16-bit integer and returns its absolute value as a
// uint16. Negation wraps, so the minimum value (0x8000) maps to itself.
func i16Abs(v uint16) uint16 {
	signed := int16(v)
	if signed >= 0 {
		return v
	}
	return uint16(-signed)
}
4528
// i32Abs interprets v as a signed 32-bit integer and returns its absolute value as a
// uint32. Negation wraps, so the minimum value (0x80000000) maps to itself.
func i32Abs(v uint32) uint32 {
	signed := int32(v)
	if signed >= 0 {
		return v
	}
	return uint32(-signed)
}
4536
4537func (ce *callEngine) callNativeFuncWithListener(ctx context.Context, m *wasm.ModuleInstance, f *function, fnl experimental.FunctionListener) context.Context {
4538	def, typ := f.definition(), f.funcType
4539
4540	ce.stackIterator.reset(ce.stack, ce.frames, f)
4541	fnl.Before(ctx, m, def, ce.peekValues(typ.ParamNumInUint64), &ce.stackIterator)
4542	ce.stackIterator.clear()
4543	ce.callNativeFunc(ctx, m, f)
4544	fnl.After(ctx, m, def, ce.peekValues(typ.ResultNumInUint64))
4545	return ctx
4546}
4547
4548// popMemoryOffset takes a memory offset off the stack for use in load and store instructions.
4549// As the top of stack value is 64-bit, this ensures it is in range before returning it.
4550func (ce *callEngine) popMemoryOffset(op *unionOperation) uint32 {
4551	offset := op.U2 + ce.popValue()
4552	if offset > math.MaxUint32 {
4553		panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
4554	}
4555	return uint32(offset)
4556}
4557
4558func (ce *callEngine) callGoFuncWithStack(ctx context.Context, m *wasm.ModuleInstance, f *function) {
4559	typ := f.funcType
4560	paramLen := typ.ParamNumInUint64
4561	resultLen := typ.ResultNumInUint64
4562	stackLen := paramLen
4563
4564	// In the interpreter engine, ce.stack may only have capacity to store
4565	// parameters. Grow when there are more results than parameters.
4566	if growLen := resultLen - paramLen; growLen > 0 {
4567		for i := 0; i < growLen; i++ {
4568			ce.stack = append(ce.stack, 0)
4569		}
4570		stackLen += growLen
4571	}
4572
4573	// Pass the stack elements to the go function.
4574	stack := ce.stack[len(ce.stack)-stackLen:]
4575	ce.callGoFunc(ctx, m, f, stack)
4576
4577	// Shrink the stack when there were more parameters than results.
4578	if shrinkLen := paramLen - resultLen; shrinkLen > 0 {
4579		ce.stack = ce.stack[0 : len(ce.stack)-shrinkLen]
4580	}
4581}
4582
// v128Dot computes a pairwise dot product over two 128-bit vectors, each passed as a
// high and a low 64-bit half and treated as eight signed 16-bit lanes.
//
// Corresponding lanes are multiplied as int32, and each pair of adjacent products is
// summed (with int32 wrap-around) into one 32-bit lane of the result. The first return
// value is built from the low halves and the second from the high halves.
//
// Note: this is deliberately kept as a separate function. For some reason (suspected to
// be a bug in the Go compiler's regalloc), inlining it causes a bug which happens
// **only when** running with -race AND arm64 AND Go 1.22.
func v128Dot(x1Hi, x1Lo, x2Hi, x2Lo uint64) (uint64, uint64) {
	var retLo, retHi uint64
	// Each iteration handles one 32-bit output lane per half: the 16-bit input lanes
	// at bit offsets shift and shift+16.
	for shift := uint(0); shift < 64; shift += 32 {
		loEven := int32(int16(x1Lo>>shift)) * int32(int16(x2Lo>>shift))
		loOdd := int32(int16(x1Lo>>(shift+16))) * int32(int16(x2Lo>>(shift+16)))
		retLo |= uint64(uint32(loEven+loOdd)) << shift

		hiEven := int32(int16(x1Hi>>shift)) * int32(int16(x2Hi>>shift))
		hiOdd := int32(int16(x1Hi>>(shift+16))) * int32(int16(x2Hi>>(shift+16)))
		retHi |= uint64(uint32(hiEven+hiOdd)) << shift
	}
	return retLo, retHi
}