yuezonghe | 824eb0c | 2024-06-27 02:32:26 -0700 | [diff] [blame] | 1 | #! /usr/bin/env perl |
| 2 | # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 3 | # |
| 4 | # Licensed under the OpenSSL license (the "License"). You may not use |
| 5 | # this file except in compliance with the License. You can obtain a copy |
| 6 | # in the file LICENSE in the source distribution or at |
| 7 | # https://www.openssl.org/source/license.html |
| 8 | |
| 9 | |
| 10 | # ==================================================================== |
| 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 12 | # project. The module is, however, dual licensed under OpenSSL and |
| 13 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 14 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 15 | # ==================================================================== |
| 16 | |
| 17 | # March 2010 |
| 18 | # |
| 19 | # The module implements "4-bit" GCM GHASH function and underlying |
| 20 | # single multiplication operation in GF(2^128). "4-bit" means that it |
| 21 | # uses 256 bytes per-key table [+128 bytes shared table]. Performance |
| 22 | # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU |
| 23 | # and are expressed in cycles per processed byte, less is better: |
| 24 | # |
| 25 | # gcc 3.3.x cc 5.2 this assembler |
| 26 | # |
| 27 | # 32-bit build 81.4 43.3 12.6 (+546%/+244%) |
| 28 | # 64-bit build 20.2 21.2 12.6 (+60%/+68%) |
| 29 | # |
| 30 | # Here is data collected on UltraSPARC T1 system running Linux: |
| 31 | # |
| 32 | # gcc 4.4.1 this assembler |
| 33 | # |
| 34 | # 32-bit build 566 50 (+1000%) |
| 35 | # 64-bit build 56 50 (+12%) |
| 36 | # |
# I don't quite understand why the difference between 32-bit and 64-bit
| 38 | # compiler-generated code is so big. Compilers *were* instructed to |
| 39 | # generate code for UltraSPARC and should have used 64-bit registers |
| 40 | # for Z vector (see C code) even in 32-bit build... Oh well, it only |
| 41 | # means more impressive improvement coefficients for this assembler |
| 42 | # module;-) Loops are aggressively modulo-scheduled in respect to |
| 43 | # references to input data and Z.hi updates to achieve 12 cycles |
| 44 | # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 |
| 45 | # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. |
| 46 | # |
| 47 | # October 2012 |
| 48 | # |
| 49 | # Add VIS3 lookup-table-free implementation using polynomial |
| 50 | # multiplication xmulx[hi] and extended addition addxc[cc] |
| 51 | # instructions. 4.52/7.63x improvement on T3/T4 or in absolute |
| 52 | # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark |
| 53 | # saturates at ~15.5x single-process result on 8-core processor, |
| 54 | # or ~20.5GBps per 2.85GHz socket. |
| 55 | |
# The last command-line argument names the output file; all generated
# assembly below is printed to STDOUT, so redirect STDOUT there.
# Use checked 3-arg open: the old 2-arg ">$output" form silently
# ignored failures and is open to mode injection via the filename.
$output = pop;
open STDOUT, ">", $output or die "can't open $output: $!";
| 58 | |
# Symbolic names for stack constants and SPARC registers, interpolated
# into the assembly heredocs below.  Grouped as list assignments over
# register files, matching the style of the VIS3 section further down.
$frame = "STACK_FRAME";
$bias  = "STACK_BIAS";

# 64-bit working values live in the %o registers.
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp) = map("%o$_",(0..5));

# Small values and pointers live in the %l registers.
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt) = map("%l$_",(0..7));

# Input argument block arrives in the %i registers.
($Xi,$Htbl,$inp,$len) = map("%i$_",(0..3));
# Emit the 4-bit lookup-table GHASH: the shared 128-byte rem_4bit
# reduction table and the streamed gcm_ghash_4bit(Xi, Htbl, inp, len)
# subroutine.  The inner loop is modulo-scheduled (loads of the next
# nibble are interleaved with the reduction of the current one) to
# reach ~12 cycles/byte on pre-Tx UltraSPARC — do not reorder the
# instructions inside the heredoc.  Note the heredoc is a runtime
# string: register names interpolate and `...` expressions are
# evaluated by the output loop at the bottom of the file.
$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
| 238 | |
# gcm_gmult_4bit multiplies the single block already in Xi by H; it
# takes no input stream, so drop $inp/$len to catch accidental use of
# those names while emitting it.
undef $inp;
undef $len;

# Emit gcm_gmult_4bit(Xi, Htbl): same 4-bit table walk as the inner
# part of gcm_ghash_4bit above, but reading nibbles from Xi only and
# with the outer streaming loop removed.
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
| 346 | |
| 347 | {{{ |
| 348 | # Straightforward 128x128-bit multiplication using Karatsuba algorithm |
| 349 | # followed by pair of 64-bit reductions [with a shortcut in first one, |
| 350 | # which allowed to break dependency between reductions and remove one |
| 351 | # multiplication from critical path]. While it might be suboptimal |
| 352 | # with regard to sheer number of multiplications, other methods [such |
| 353 | # as aggregate reduction] would require more 64-bit registers, which |
| 354 | # we don't have in 32-bit application context. |
| 355 | |
| 356 | ($Xip,$Htable,$inp,$len)=map("%i$_",(0..3)); |
| 357 | |
| 358 | ($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)= |
| 359 | (map("%o$_",(0..5,7)),map("%g$_",(1..5))); |
| 360 | |
| 361 | ($shl,$shr)=map("%l$_",(0..7)); |
| 362 | |
| 363 | # For details regarding "twisted H" see ghash-x86.pl. |
| 364 | $code.=<<___; |
| 365 | .globl gcm_init_vis3 |
| 366 | .align 32 |
| 367 | gcm_init_vis3: |
| 368 | save %sp,-$frame,%sp |
| 369 | |
| 370 | ldx [%i1+0],$Hhi |
| 371 | ldx [%i1+8],$Hlo |
| 372 | mov 0xE1,$Xhi |
| 373 | mov 1,$Xlo |
| 374 | sllx $Xhi,57,$Xhi |
| 375 | srax $Hhi,63,$C0 ! broadcast carry |
| 376 | addcc $Hlo,$Hlo,$Hlo ! H<<=1 |
| 377 | addxc $Hhi,$Hhi,$Hhi |
| 378 | and $C0,$Xlo,$Xlo |
| 379 | and $C0,$Xhi,$Xhi |
| 380 | xor $Xlo,$Hlo,$Hlo |
| 381 | xor $Xhi,$Hhi,$Hhi |
| 382 | stx $Hlo,[%i0+8] ! save twisted H |
| 383 | stx $Hhi,[%i0+0] |
| 384 | |
| 385 | sethi %hi(0xA0406080),$V |
| 386 | sethi %hi(0x20C0E000),%l0 |
| 387 | or $V,%lo(0xA0406080),$V |
| 388 | or %l0,%lo(0x20C0E000),%l0 |
| 389 | sllx $V,32,$V |
| 390 | or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000 |
| 391 | stx $V,[%i0+16] |
| 392 | |
| 393 | ret |
| 394 | restore |
| 395 | .type gcm_init_vis3,#function |
| 396 | .size gcm_init_vis3,.-gcm_init_vis3 |
| 397 | |
| 398 | .globl gcm_gmult_vis3 |
| 399 | .align 32 |
| 400 | gcm_gmult_vis3: |
| 401 | save %sp,-$frame,%sp |
| 402 | |
| 403 | ldx [$Xip+8],$Xlo ! load Xi |
| 404 | ldx [$Xip+0],$Xhi |
| 405 | ldx [$Htable+8],$Hlo ! load twisted H |
| 406 | ldx [$Htable+0],$Hhi |
| 407 | |
| 408 | mov 0xE1,%l7 |
| 409 | sllx %l7,57,$xE1 ! 57 is not a typo |
| 410 | ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 |
| 411 | |
| 412 | xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing |
| 413 | xmulx $Xlo,$Hlo,$C0 |
| 414 | xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing |
| 415 | xmulx $C2,$Hhl,$C1 |
| 416 | xmulxhi $Xlo,$Hlo,$Xlo |
| 417 | xmulxhi $C2,$Hhl,$C2 |
| 418 | xmulxhi $Xhi,$Hhi,$C3 |
| 419 | xmulx $Xhi,$Hhi,$Xhi |
| 420 | |
| 421 | sll $C0,3,$sqr |
| 422 | srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] |
| 423 | xor $C0,$sqr,$sqr |
| 424 | sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] |
| 425 | |
| 426 | xor $C0,$C1,$C1 ! Karatsuba post-processing |
| 427 | xor $Xlo,$C2,$C2 |
| 428 | xor $sqr,$Xlo,$Xlo ! real destination is $C1 |
| 429 | xor $C3,$C2,$C2 |
| 430 | xor $Xlo,$C1,$C1 |
| 431 | xor $Xhi,$C2,$C2 |
| 432 | xor $Xhi,$C1,$C1 |
| 433 | |
| 434 | xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 |
| 435 | xor $C0,$C2,$C2 |
| 436 | xmulx $C1,$xE1,$C0 |
| 437 | xor $C1,$C3,$C3 |
| 438 | xmulxhi $C1,$xE1,$C1 |
| 439 | |
| 440 | xor $Xlo,$C2,$C2 |
| 441 | xor $C0,$C2,$C2 |
| 442 | xor $C1,$C3,$C3 |
| 443 | |
| 444 | stx $C2,[$Xip+8] ! save Xi |
| 445 | stx $C3,[$Xip+0] |
| 446 | |
| 447 | ret |
| 448 | restore |
| 449 | .type gcm_gmult_vis3,#function |
| 450 | .size gcm_gmult_vis3,.-gcm_gmult_vis3 |
| 451 | |
| 452 | .globl gcm_ghash_vis3 |
| 453 | .align 32 |
| 454 | gcm_ghash_vis3: |
| 455 | save %sp,-$frame,%sp |
| 456 | nop |
| 457 | srln $len,0,$len ! needed on v8+, "nop" on v9 |
| 458 | |
| 459 | ldx [$Xip+8],$C2 ! load Xi |
| 460 | ldx [$Xip+0],$C3 |
| 461 | ldx [$Htable+8],$Hlo ! load twisted H |
| 462 | ldx [$Htable+0],$Hhi |
| 463 | |
| 464 | mov 0xE1,%l7 |
| 465 | sllx %l7,57,$xE1 ! 57 is not a typo |
| 466 | ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 |
| 467 | |
| 468 | and $inp,7,$shl |
| 469 | andn $inp,7,$inp |
| 470 | sll $shl,3,$shl |
| 471 | prefetch [$inp+63], 20 |
| 472 | sub %g0,$shl,$shr |
| 473 | |
| 474 | xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing |
| 475 | .Loop: |
| 476 | ldx [$inp+8],$Xlo |
| 477 | brz,pt $shl,1f |
| 478 | ldx [$inp+0],$Xhi |
| 479 | |
| 480 | ldx [$inp+16],$C1 ! align data |
| 481 | srlx $Xlo,$shr,$C0 |
| 482 | sllx $Xlo,$shl,$Xlo |
| 483 | sllx $Xhi,$shl,$Xhi |
| 484 | srlx $C1,$shr,$C1 |
| 485 | or $C0,$Xhi,$Xhi |
| 486 | or $C1,$Xlo,$Xlo |
| 487 | 1: |
| 488 | add $inp,16,$inp |
| 489 | sub $len,16,$len |
| 490 | xor $C2,$Xlo,$Xlo |
| 491 | xor $C3,$Xhi,$Xhi |
| 492 | prefetch [$inp+63], 20 |
| 493 | |
| 494 | xmulx $Xlo,$Hlo,$C0 |
| 495 | xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing |
| 496 | xmulx $C2,$Hhl,$C1 |
| 497 | xmulxhi $Xlo,$Hlo,$Xlo |
| 498 | xmulxhi $C2,$Hhl,$C2 |
| 499 | xmulxhi $Xhi,$Hhi,$C3 |
| 500 | xmulx $Xhi,$Hhi,$Xhi |
| 501 | |
| 502 | sll $C0,3,$sqr |
| 503 | srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] |
| 504 | xor $C0,$sqr,$sqr |
| 505 | sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] |
| 506 | |
| 507 | xor $C0,$C1,$C1 ! Karatsuba post-processing |
| 508 | xor $Xlo,$C2,$C2 |
| 509 | xor $sqr,$Xlo,$Xlo ! real destination is $C1 |
| 510 | xor $C3,$C2,$C2 |
| 511 | xor $Xlo,$C1,$C1 |
| 512 | xor $Xhi,$C2,$C2 |
| 513 | xor $Xhi,$C1,$C1 |
| 514 | |
| 515 | xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 |
| 516 | xor $C0,$C2,$C2 |
| 517 | xmulx $C1,$xE1,$C0 |
| 518 | xor $C1,$C3,$C3 |
| 519 | xmulxhi $C1,$xE1,$C1 |
| 520 | |
| 521 | xor $Xlo,$C2,$C2 |
| 522 | xor $C0,$C2,$C2 |
| 523 | brnz,pt $len,.Loop |
| 524 | xor $C1,$C3,$C3 |
| 525 | |
| 526 | stx $C2,[$Xip+8] ! save Xi |
| 527 | stx $C3,[$Xip+0] |
| 528 | |
| 529 | ret |
| 530 | restore |
| 531 | .type gcm_ghash_vis3,#function |
| 532 | .size gcm_ghash_vis3,.-gcm_ghash_vis3 |
| 533 | ___ |
| 534 | }}} |
# Identification string embedded in the generated object file.
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
| 539 | |
| 540 | |
| 541 | # Purpose of these subroutines is to explicitly encode VIS instructions, |
| 542 | # so that one can compile the module without having to specify VIS |
| 543 | # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. |
| 544 | # Idea is to reserve for option to produce "universal" binary and let |
| 545 | # programmer detect if current CPU is VIS capable at run-time. |
# Hand-encode a three-register VIS3 instruction (addxc/addxccc/xmulx/
# xmulxhi) as a raw ".word", so the module assembles even when the
# assembler lacks VIS3 support.  Unknown mnemonics, or operands that
# are not %g/%o/%l/%i integer registers, are returned verbatim.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd) = @_;
my %reg_base = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my %visopf   = ( "addxc"   => 0x011,
		 "addxccc" => 0x013,
		 "xmulx"   => 0x115,
		 "xmulxhi" => 0x116 );

my $asm = "$mnemonic\t$rs1,$rs2,$rd";

my $opf = $visopf{$mnemonic} or return $asm;

# Translate each operand to its global register number; bail out to
# the textual form if any operand is not a recognizable register.
my @num;
for my $operand ($rs1,$rs2,$rd) {
	return $asm unless $operand =~ /%([goli])([0-9])/;
	push @num, $reg_base{$1}+$2;
}

return sprintf ".word\t0x%08x !%s",
	       0x81b00000|$num[2]<<25|$num[0]<<14|$opf<<5|$num[1],
	       $asm;
}
| 570 | |
# Post-process the accumulated $code line by line: evaluate the
# `...` constant expressions, hand-encode VIS3 instructions via
# unvis3(), and print the result to the output file on STDOUT.
for my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	$line =~ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		unvis3($1,$2,$3,$4)
	/ge;

	print $line,"\n";
}

# Flush and verify the write; buffered I/O errors surface at close.
close STDOUT or die "error closing STDOUT: $!";