Blame - ap/lib/libpng/pnggccrd.c - T106_DC

blob: 8d81c31471321450d8df6755c37bcbddc245edfe [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
				2	*
				3	* For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
				4	*
				5	* See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
				6	* and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
				7	* for Intel's performance analysis of the MMX vs. non-MMX code.
				8	*
				9	* libpng version 1.2.5 - October 3, 2002
				10	* For conditions of distribution and use, see copyright notice in png.h
				11	* Copyright (c) 1998-2002 Glenn Randers-Pehrson
				12	* Copyright (c) 1998, Intel Corporation
				13	*
				14	* Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
				15	* Interface to libpng contributed by Gilles Vollant, 1999.
				16	* GNU C port by Greg Roelofs, 1999-2001.
				17	*
				18	* Lines 2350-4300 converted in place with intel2gas 1.3.1:
				19	*
				20	* intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
				21	*
				22	* and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
				23	*
				24	* NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
				25	* is required to assemble the newer MMX instructions such as movq.
				26	* For djgpp, see
				27	*
				28	* ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
				29	*
				30	* (or a later version in the same directory). For Linux, check your
				31	* distribution's web site(s) or try these links:
				32	*
				33	* http://rufus.w3.org/linux/RPM/binutils.html
				34	* http://www.debian.org/Packages/stable/devel/binutils.html
				35	* ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
				36	* binutils.tgz
				37	*
				38	* For other platforms, see the main GNU site:
				39	*
				40	* ftp://ftp.gnu.org/pub/gnu/binutils/
				41	*
				42	* Version 2.5.2l.15 is definitely too old...
				43	*/
				44
				45	/*
				46	* TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
				47	* =====================================
				48	*
				49	* 19991006:
				50	* - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
				51	*
				52	* 19991007:
				53	* - additional optimizations (possible or definite):
				54	* x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
				55	* - write MMX code for 48-bit case (pixel_bytes == 6)
				56	* - figure out what's up with 24-bit case (pixel_bytes == 3):
				57	* why subtract 8 from width_mmx in the pass 4/5 case?
				58	* (only width_mmx case) (near line 1606)
				59	* x [DONE] replace pixel_bytes within each block with the true
				60	* constant value (or are compilers smart enough to do that?)
				61	* - rewrite all MMX interlacing code so it's aligned with
				62	* the beginning of the row buffer, not the end. This
				63	* would not only allow one to eliminate half of the memory
				64	* writes for odd passes (that is, pass == odd), it may also
				65	* eliminate some unaligned-data-access exceptions (assuming
				66	* there's a penalty for not aligning 64-bit accesses on
				67	* 64-bit boundaries). The only catch is that the "leftover"
				68	* pixel(s) at the end of the row would have to be saved,
				69	* but there are enough unused MMX registers in every case,
				70	* so this is not a problem. A further benefit is that the
				71	* post-MMX cleanup code (C code) in at least some of the
				72	* cases could be done within the assembler block.
				73	* x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
				74	* inconsistent, and don't match the MMX Programmer's Reference
				75	* Manual conventions anyway. They should be changed to
				76	* "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
				77	* was lowest in memory (e.g., corresponding to a left pixel)
				78	* and b7 is the byte that was highest (e.g., a right pixel).
				79	*
				80	* 19991016:
				81	* - Brennan's Guide notwithstanding, gcc under Linux does not
				82	* want globals prefixed by underscores when referencing them--
				83	* i.e., if the variable is const4, then refer to it as const4,
				84	* not _const4. This seems to be a djgpp-specific requirement.
				85	* Also, such variables apparently must be declared outside
				86	* of functions; neither static nor automatic variables work if
				87	* defined within the scope of a single function, but both
				88	* static and truly global (multi-module) variables work fine.
				89	*
				90	* 19991023:
				91	* - fixed png_combine_row() non-MMX replication bug (odd passes only?)
				92	* - switched from string-concatenation-with-macros to cleaner method of
				93	* renaming global variables for djgpp--i.e., always use prefixes in
				94	* inlined assembler code (== strings) and conditionally rename the
				95	* variables, not the other way around. Hence _const4, _mask8_0, etc.
				96	*
				97	* 19991024:
				98	* - fixed mmxsupport()/png_do_read_interlace() first-row bug
				99	* This one was severely weird: even though mmxsupport() doesn't touch
				100	* ebx (where "row" pointer was stored), it nevertheless managed to zero
				101	* the register (even in static/non-fPIC code--see below), which in turn
				102	* caused png_do_read_interlace() to return prematurely on the first row of
				103	* interlaced images (i.e., without expanding the interlaced pixels).
				104	* Inspection of the generated assembly code didn't turn up any clues,
				105	* although it did point at a minor optimization (i.e., get rid of
				106	* mmx_supported_local variable and just use eax). Possibly the CPUID
				107	* instruction is more destructive than it looks? (Not yet checked.)
				108	* - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
				109	* listings... Apparently register spillage has to do with ebx, since
				110	* it's used to index the global offset table. Commenting it out of the
				111	* input-reg lists in png_combine_row() eliminated compiler barfage, so
				112	* ifdef'd with __PIC__ macro: if defined, use a global for unmask
				113	*
				114	* 19991107:
				115	* - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
				116	* "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
				117	*
				118	* 19991120:
				119	* - made "diff" variable (now "_dif") global to simplify conversion of
				120	* filtering routines (running out of regs, sigh). "diff" is still used
				121	* in interlacing routines, however.
				122	* - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
				123	* macro determines which is used); original not yet tested.
				124	*
				125	* 20000213:
				126	* - when compiling with gcc, be sure to use -fomit-frame-pointer
				127	*
				128	* 20000319:
				129	* - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
				130	* pass == 4 or 5, that caused visible corruption of interlaced images
				131	*
				132	* 20000623:
				133	* - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
				134	* many of the form "forbidden register 0 (ax) was spilled for class AREG."
				135	* This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
				136	* Chuck Wilson supplied a patch involving dummy output registers. See
				137	* http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
				138	* for the original (anonymous) SourceForge bug report.
				139	*
				140	* 20000706:
				141	* - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
				142	* pnggccrd.c: In function `png_combine_row':
				143	* pnggccrd.c:525: more than 10 operands in `asm'
				144	* pnggccrd.c:669: more than 10 operands in `asm'
				145	* pnggccrd.c:828: more than 10 operands in `asm'
				146	* pnggccrd.c:994: more than 10 operands in `asm'
				147	* pnggccrd.c:1177: more than 10 operands in `asm'
				148	* They are all the same problem and can be worked around by using the
				149	* global _unmask variable unconditionally, not just in the -fPIC case.
				150	* Reportedly earlier versions of gcc also have the problem with more than
				151	* 10 operands; they just don't report it. Much strangeness ensues, etc.
				152	*
				153	* 20000729:
				154	* - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
				155	* MMX routine); began converting png_read_filter_row_mmx_sub()
				156	* - to finish remaining sections:
				157	* - clean up indentation and comments
				158	* - preload local variables
				159	* - add output and input regs (order of former determines numerical
				160	* mapping of latter)
				161	* - avoid all usage of ebx (including bx, bh, bl) register [20000823]
				162	* - remove "$" from addressing of Shift and Mask variables [20000823]
				163	*
				164	* 20000731:
				165	* - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
				166	*
				167	* 20000822:
				168	* - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
				169	* shared-library (-fPIC) version! Code works just fine as part of static
				170	* library. Damn damn damn damn damn, should have tested that sooner.
				171	* ebx is getting clobbered again (explicitly this time); need to save it
				172	* on stack or rewrite asm code to avoid using it altogether. Blargh!
				173	*
				174	* 20000823:
				175	* - first section was trickiest; all remaining sections have ebx -> edx now.
				176	* (-fPIC works again.) Also added missing underscores to various Shift*
				177	* and Mask globals and got rid of leading "$" signs.
				178	*
				179	* 20000826:
				180	* - added visual separators to help navigate microscopic printed copies
				181	* (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
				182	* on png_read_filter_row_mmx_avg()
				183	*
				184	* 20000828:
				185	* - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
				186	* What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
				187	* cleaned up/shortened in either routine, but functionality is complete
				188	* and seems to be working fine.
				189	*
				190	* 20000829:
				191	* - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
				192	* as an input reg (with dummy output variables, etc.), then it cannot
				193	* also appear in the clobber list or gcc 2.95.2 will barf. The solution
				194	* is simple enough...
				195	*
				196	* 20000914:
				197	* - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
				198	* correctly (but 48-bit RGB just fine)
				199	*
				200	* 20000916:
				201	* - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
				202	* - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
				203	* - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
				204	* - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
				205	*
				206	* 20010101:
				207	* - added new png_init_mmx_flags() function (here only because it needs to
				208	* call mmxsupport(), which should probably become global png_mmxsupport());
				209	* modified other MMX routines to run conditionally (png_ptr->asm_flags)
				210	*
				211	* 20010103:
				212	* - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
				213	* and made it public; moved png_init_mmx_flags() to png.c as internal func
				214	*
				215	* 20010104:
				216	* - removed dependency on png_read_filter_row_c() (C code already duplicated
				217	* within MMX version of png_read_filter_row()) so no longer necessary to
				218	* compile it into pngrutil.o
				219	*
				220	* 20010310:
				221	* - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
				222	*
				223	* 20020304:
				224	* - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
				225	*
				226	* STILL TO DO:
				227	* - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
				228	* - write MMX code for 48-bit case (pixel_bytes == 6)
				229	* - figure out what's up with 24-bit case (pixel_bytes == 3):
				230	* why subtract 8 from width_mmx in the pass 4/5 case?
				231	* (only width_mmx case) (near line 1606)
				232	* - rewrite all MMX interlacing code so it's aligned with beginning
				233	* of the row buffer, not the end (see 19991007 for details)
				234	* x pick one version of mmxsupport() and get rid of the other
				235	* - add error messages to any remaining bogus default cases
				236	* - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
				237	* x add support for runtime enable/disable/query of various MMX routines
				238	*/
				239
				240	#define PNG_INTERNAL
				241	#include "png.h"
				242
				243	#if defined(PNG_USE_PNGGCCRD)
				244
				245	int PNGAPI png_mmx_support(void);
				246
				247	#ifdef PNG_USE_LOCAL_ARRAYS /* per-pass tables, indexed by png_ptr->pass in png_combine_row() */
				248	static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0}; /* offset, in pixels, of the first pixel copied (initial_val) */
				249	static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1}; /* distance, in pixels, between successive copies (stride) */
				250	static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1}; /* consecutive pixels copied at each step (rep_bytes, after scaling by bytes per pixel) */
				251	#endif /* PNG_USE_LOCAL_ARRAYS */
				252
				253	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
				254	/* djgpp, Win32, and Cygwin prepend their own underscore to C global symbols,
				255	* so on those targets map each _name used in this file to the bare C identifier (the inline-asm strings keep the _name spelling; see changelog 19991023): */
				256	#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
				257	# define _mmx_supported mmx_supported
				258	# define _const4 const4
				259	# define _const6 const6
				260	# define _mask8_0 mask8_0
				261	# define _mask16_1 mask16_1
				262	# define _mask16_0 mask16_0
				263	# define _mask24_2 mask24_2
				264	# define _mask24_1 mask24_1
				265	# define _mask24_0 mask24_0
				266	# define _mask32_3 mask32_3
				267	# define _mask32_2 mask32_2
				268	# define _mask32_1 mask32_1
				269	# define _mask32_0 mask32_0
				270	# define _mask48_5 mask48_5
				271	# define _mask48_4 mask48_4
				272	# define _mask48_3 mask48_3
				273	# define _mask48_2 mask48_2
				274	# define _mask48_1 mask48_1
				275	# define _mask48_0 mask48_0
				276	# define _LBCarryMask LBCarryMask
				277	# define _HBClearMask HBClearMask
				278	# define _ActiveMask ActiveMask
				279	# define _ActiveMask2 ActiveMask2
				280	# define _ActiveMaskEnd ActiveMaskEnd
				281	# define _ShiftBpp ShiftBpp
				282	# define _ShiftRem ShiftRem
				283	#ifdef PNG_THREAD_UNSAFE_OK /* the globals renamed below are declared only when this macro is defined */
				284	# define _unmask unmask
				285	# define _FullLength FullLength
				286	# define _MMXLength MMXLength
				287	# define _dif dif
				288	# define _patemp patemp
				289	# define _pbtemp pbtemp
				290	# define _pctemp pctemp
				291	#endif /* PNG_THREAD_UNSAFE_OK */
				292	#endif /* __DJGPP__ || WIN32 || __CYGWIN__ */
				293
				294
				295	/* These constants are used in the inlined MMX assembly code.
				296	Ignore gcc's "At top level: defined but not used" warnings. */
				297
				298	/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
				299	* since that case uses the %ebx register for indexing the Global Offset Table
				300	* and there were no other registers available. But gcc 2.95 and later emit
				301	* "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
				302	* in the non-PIC case, so we'll just use the global unconditionally now.
				303	*/
				304	#ifdef PNG_THREAD_UNSAFE_OK
				305	static int _unmask; // png_combine_row() stores ~mask here right before each MMX asm block, which loads it with movd
				306	#endif
				307	// Bit-selector tables for png_combine_row(): the asm ANDs them with the low byte of _unmask replicated 8 times, then compares each byte with zero, giving 0xFF in exactly the bytes whose pixel has its mask bit set. 8 bpp: byte k (k = 0 lowest in memory) selects bit 0x80 >> k.
				308	static unsigned long long _mask8_0 = 0x0102040810204080LL;
				309	// 16 bpp: two identical selector bytes per pixel; _mask16_0 is applied to the first quadword (pixels 0-3), _mask16_1 to the second (pixels 4-7).
				310	static unsigned long long _mask16_1 = 0x0101020204040808LL;
				311	static unsigned long long _mask16_0 = 0x1010202040408080LL;
				312	// 24 bpp: three identical selector bytes per pixel, running across _mask24_0, _1, _2 in that order (use not visible in this part of the file; presumably analogous to the 16 bpp case).
				313	static unsigned long long _mask24_2 = 0x0101010202020404LL;
				314	static unsigned long long _mask24_1 = 0x0408080810101020LL;
				315	static unsigned long long _mask24_0 = 0x2020404040808080LL;
				316	// 32 bpp: four identical selector bytes per pixel, two pixels per quadword, _mask32_0 first.
				317	static unsigned long long _mask32_3 = 0x0101010102020202LL;
				318	static unsigned long long _mask32_2 = 0x0404040408080808LL;
				319	static unsigned long long _mask32_1 = 0x1010101020202020LL;
				320	static unsigned long long _mask32_0 = 0x4040404080808080LL;
				321	// 48 bpp: six identical selector bytes per pixel, running across _mask48_0 .. _mask48_5 in that order.
				322	static unsigned long long _mask48_5 = 0x0101010101010202LL;
				323	static unsigned long long _mask48_4 = 0x0202020204040404LL;
				324	static unsigned long long _mask48_3 = 0x0404080808080808LL;
				325	static unsigned long long _mask48_2 = 0x1010101010102020LL;
				326	static unsigned long long _mask48_1 = 0x2020202040404040LL;
				327	static unsigned long long _mask48_0 = 0x4040808080808080LL;
				328
				329	static unsigned long long _const4 = 0x0000000000FFFFFFLL; // selects the low three bytes of a quadword
				330	//static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
				331	static unsigned long long _const6 = 0x00000000000000FFLL; // selects the lowest byte of a quadword
				332
				333	// These are used in the row-filter routines and should/would be local
				334	// variables if not for gcc addressing limitations.
				335	// WARNING: Their presence probably defeats the thread safety of libpng.
				336	// (Hence they are compiled in only when PNG_THREAD_UNSAFE_OK is defined.)
				337	#ifdef PNG_THREAD_UNSAFE_OK
				338	static png_uint_32 _FullLength;
				339	static png_uint_32 _MMXLength;
				340	static int _dif;
				341	static int _patemp; // temp variables for Paeth routine
				342	static int _pbtemp;
				343	static int _pctemp;
				344	#endif
				345
				346	void /* PRIVATE */
				347	png_squelch_warnings(void) /* takes and returns nothing; judging by its name and the "defined but not used" note above the constants, it exists only so that each static below is referenced from C code as well as from inline-asm strings */
				348	{
				349	#ifdef PNG_THREAD_UNSAFE_OK /* these globals are declared only under the same macro */
				350	_dif = _dif; /* self-assignment: the stored value is unchanged, the variable is merely mentioned */
				351	_patemp = _patemp;
				352	_pbtemp = _pbtemp;
				353	_pctemp = _pctemp;
				354	_MMXLength = _MMXLength;
				355	#endif /* PNG_THREAD_UNSAFE_OK -- note that _FullLength and _unmask are not referenced here */
				356	_const4 = _const4;
				357	_const6 = _const6;
				358	_mask8_0 = _mask8_0;
				359	_mask16_1 = _mask16_1;
				360	_mask16_0 = _mask16_0;
				361	_mask24_2 = _mask24_2;
				362	_mask24_1 = _mask24_1;
				363	_mask24_0 = _mask24_0;
				364	_mask32_3 = _mask32_3;
				365	_mask32_2 = _mask32_2;
				366	_mask32_1 = _mask32_1;
				367	_mask32_0 = _mask32_0;
				368	_mask48_5 = _mask48_5;
				369	_mask48_4 = _mask48_4;
				370	_mask48_3 = _mask48_3;
				371	_mask48_2 = _mask48_2;
				372	_mask48_1 = _mask48_1;
				373	_mask48_0 = _mask48_0;
				374	}
				375	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				376
				377
				378	static int _mmx_supported = 2; /* 2 = MMX support not yet determined: while it is still 2, png_combine_row() issues a warning and calls png_mmx_support(), which (per the 20010103 changelog entry) sets this variable */
				379
				380	/*===========================================================================*/
				381	/* */
				382	/* P N G _ C O M B I N E _ R O W */
				383	/* */
				384	/*===========================================================================*/
				385
				386	#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
				387
				388	#define BPP2 2
				389	#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
				390	#define BPP4 4
				391	#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
				392	#define BPP8 8
				393
				394	/* Combines the row recently read in with the previous row.
				395	This routine takes care of alpha and transparency if requested.
				396	This routine also handles the two methods of progressive display
				397	of interlaced images, depending on the mask value.
				398	The mask value describes which pixels are to be combined with
				399	the row. The pattern always repeats every 8 pixels, so just 8
				400	bits are needed. A one indicates the pixel is to be combined; a
				401	zero indicates the pixel is to be skipped. This is in addition
				402	to any alpha or transparency value associated with the pixel.
				403	If you want all pixels to be combined, pass 0xff (255) in mask. */
				404
				405	/* Use this routine for the x86 platform - it uses a faster MMX routine
				406	if the machine supports MMX. */
				407
				408	void /* PRIVATE */
				409	png_combine_row(png_structp png_ptr, png_bytep row, int mask)
				410	{
				411	png_debug(1, "in png_combine_row (pnggccrd.c)\n");
				412
				413	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
				414	if (_mmx_supported == 2) {
				415	/* this should have happened in png_init_mmx_flags() already */
				416	png_warning(png_ptr, "asm_flags may not have been initialized");
				417	png_mmx_support();
				418	}
				419	#endif
				420
				421	if (mask == 0xff)
				422	{
				423	png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
				424	png_memcpy(row, png_ptr->row_buf + 1,
				425	(png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
				426	}
				427	else /* (png_combine_row() is never called with mask == 0) */
				428	{
				429	switch (png_ptr->row_info.pixel_depth)
				430	{
				431	case 1: /* png_ptr->row_info.pixel_depth */
				432	{
				433	png_bytep sp;
				434	png_bytep dp;
				435	int s_inc, s_start, s_end;
				436	int m;
				437	int shift;
				438	png_uint_32 i;
				439
				440	sp = png_ptr->row_buf + 1;
				441	dp = row;
				442	m = 0x80;
				443	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
				444	if (png_ptr->transformations & PNG_PACKSWAP)
				445	{
				446	s_start = 0;
				447	s_end = 7;
				448	s_inc = 1;
				449	}
				450	else
				451	#endif
				452	{
				453	s_start = 7;
				454	s_end = 0;
				455	s_inc = -1;
				456	}
				457
				458	shift = s_start;
				459
				460	for (i = 0; i < png_ptr->width; i++)
				461	{
				462	if (m & mask)
				463	{
				464	int value;
				465
				466	value = (*sp >> shift) & 0x1;
				467	*dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
				468	*dp \|= (png_byte)(value << shift);
				469	}
				470
				471	if (shift == s_end)
				472	{
				473	shift = s_start;
				474	sp++;
				475	dp++;
				476	}
				477	else
				478	shift += s_inc;
				479
				480	if (m == 1)
				481	m = 0x80;
				482	else
				483	m >>= 1;
				484	}
				485	break;
				486	}
				487
				488	case 2: /* png_ptr->row_info.pixel_depth */
				489	{
				490	png_bytep sp;
				491	png_bytep dp;
				492	int s_start, s_end, s_inc;
				493	int m;
				494	int shift;
				495	png_uint_32 i;
				496	int value;
				497
				498	sp = png_ptr->row_buf + 1;
				499	dp = row;
				500	m = 0x80;
				501	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
				502	if (png_ptr->transformations & PNG_PACKSWAP)
				503	{
				504	s_start = 0;
				505	s_end = 6;
				506	s_inc = 2;
				507	}
				508	else
				509	#endif
				510	{
				511	s_start = 6;
				512	s_end = 0;
				513	s_inc = -2;
				514	}
				515
				516	shift = s_start;
				517
				518	for (i = 0; i < png_ptr->width; i++)
				519	{
				520	if (m & mask)
				521	{
				522	value = (*sp >> shift) & 0x3;
				523	*dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
				524	*dp \|= (png_byte)(value << shift);
				525	}
				526
				527	if (shift == s_end)
				528	{
				529	shift = s_start;
				530	sp++;
				531	dp++;
				532	}
				533	else
				534	shift += s_inc;
				535	if (m == 1)
				536	m = 0x80;
				537	else
				538	m >>= 1;
				539	}
				540	break;
				541	}
				542
				543	case 4: /* png_ptr->row_info.pixel_depth */
				544	{
				545	png_bytep sp;
				546	png_bytep dp;
				547	int s_start, s_end, s_inc;
				548	int m;
				549	int shift;
				550	png_uint_32 i;
				551	int value;
				552
				553	sp = png_ptr->row_buf + 1;
				554	dp = row;
				555	m = 0x80;
				556	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
				557	if (png_ptr->transformations & PNG_PACKSWAP)
				558	{
				559	s_start = 0;
				560	s_end = 4;
				561	s_inc = 4;
				562	}
				563	else
				564	#endif
				565	{
				566	s_start = 4;
				567	s_end = 0;
				568	s_inc = -4;
				569	}
				570	shift = s_start;
				571
				572	for (i = 0; i < png_ptr->width; i++)
				573	{
				574	if (m & mask)
				575	{
				576	value = (*sp >> shift) & 0xf;
				577	*dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
				578	*dp \|= (png_byte)(value << shift);
				579	}
				580
				581	if (shift == s_end)
				582	{
				583	shift = s_start;
				584	sp++;
				585	dp++;
				586	}
				587	else
				588	shift += s_inc;
				589	if (m == 1)
				590	m = 0x80;
				591	else
				592	m >>= 1;
				593	}
				594	break;
				595	}
				596
				597	case 8: /* png_ptr->row_info.pixel_depth */
				598	{
				599	png_bytep srcptr;
				600	png_bytep dstptr;
				601
				602	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				603	#if !defined(PNG_1_0_X)
				604	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
				605	/* && _mmx_supported */ )
				606	#else
				607	if (_mmx_supported)
				608	#endif
				609	{
				610	png_uint_32 len;
				611	int diff;
				612	int dummy_value_a; // fix 'forbidden register spilled' error
				613	int dummy_value_d;
				614	int dummy_value_c;
				615	int dummy_value_S;
				616	int dummy_value_D;
				617	_unmask = ~mask; // global variable for -fPIC version
				618	srcptr = png_ptr->row_buf + 1;
				619	dstptr = row;
				620	len = png_ptr->width &~7; // reduce to multiple of 8
				621	diff = (int) (png_ptr->width & 7); // amount lost
				622
				623	__asm__ __volatile__ (
				624	"movd _unmask, %%mm7 \n\t" // load bit pattern
				625	"psubb %%mm6, %%mm6 \n\t" // zero mm6
				626	"punpcklbw %%mm7, %%mm7 \n\t"
				627	"punpcklwd %%mm7, %%mm7 \n\t"
				628	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
				629
				630	"movq _mask8_0, %%mm0 \n\t"
				631	"pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
				632	"pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
				633
				634	// preload "movl len, %%ecx \n\t" // load length of line
				635	// preload "movl srcptr, %%esi \n\t" // load source
				636	// preload "movl dstptr, %%edi \n\t" // load dest
				637
				638	"cmpl $0, %%ecx \n\t" // len == 0 ?
				639	"je mainloop8end \n\t"
				640
				641	"mainloop8: \n\t"
				642	"movq (%%esi), %%mm4 \n\t" // *srcptr
				643	"pand %%mm0, %%mm4 \n\t"
				644	"movq %%mm0, %%mm6 \n\t"
				645	"pandn (%%edi), %%mm6 \n\t" // *dstptr
				646	"por %%mm6, %%mm4 \n\t"
				647	"movq %%mm4, (%%edi) \n\t"
				648	"addl $8, %%esi \n\t" // inc by 8 bytes processed
				649	"addl $8, %%edi \n\t"
				650	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
				651	"ja mainloop8 \n\t"
				652
				653	"mainloop8end: \n\t"
				654	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
				655	"movl %%eax, %%ecx \n\t"
				656	"cmpl $0, %%ecx \n\t"
				657	"jz end8 \n\t"
				658	// preload "movl mask, %%edx \n\t"
				659	"sall $24, %%edx \n\t" // make low byte, high byte
				660
				661	"secondloop8: \n\t"
				662	"sall %%edx \n\t" // move high bit to CF
				663	"jnc skip8 \n\t" // if CF = 0
				664	"movb (%%esi), %%al \n\t"
				665	"movb %%al, (%%edi) \n\t"
				666
				667	"skip8: \n\t"
				668	"incl %%esi \n\t"
				669	"incl %%edi \n\t"
				670	"decl %%ecx \n\t"
				671	"jnz secondloop8 \n\t"
				672
				673	"end8: \n\t"
				674	"EMMS \n\t" // DONE
				675
				676	: "=a" (dummy_value_a), // output regs (dummy)
				677	"=d" (dummy_value_d),
				678	"=c" (dummy_value_c),
				679	"=S" (dummy_value_S),
				680	"=D" (dummy_value_D)
				681
				682	: "3" (srcptr), // esi // input regs
				683	"4" (dstptr), // edi
				684	"0" (diff), // eax
				685	// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
				686	"2" (len), // ecx
				687	"1" (mask) // edx
				688
				689	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				690	: "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
				691	#endif
				692	);
				693	}
				694	else /* mmx _not supported - Use modified C routine */
				695	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				696	{
				697	register png_uint_32 i;
				698	png_uint_32 initial_val = png_pass_start[png_ptr->pass];
				699	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
				700	register int stride = png_pass_inc[png_ptr->pass];
				701	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
				702	register int rep_bytes = png_pass_width[png_ptr->pass];
				703	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
				704	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
				705	int diff = (int) (png_ptr->width & 7); /* amount lost */
				706	register png_uint_32 final_val = len; /* GRR bugfix */
				707
				708	srcptr = png_ptr->row_buf + 1 + initial_val;
				709	dstptr = row + initial_val;
				710
				711	for (i = initial_val; i < final_val; i += stride)
				712	{
				713	png_memcpy(dstptr, srcptr, rep_bytes);
				714	srcptr += stride;
				715	dstptr += stride;
				716	}
				717	if (diff) /* number of leftover pixels: 3 for pngtest */
				718	{
				719	final_val+=diff /* BPP1 / ;
				720	for (; i < final_val; i += stride)
				721	{
				722	if (rep_bytes > (int)(final_val-i))
				723	rep_bytes = (int)(final_val-i);
				724	png_memcpy(dstptr, srcptr, rep_bytes);
				725	srcptr += stride;
				726	dstptr += stride;
				727	}
				728	}
				729
				730	} /* end of else (_mmx_supported) */
				731
				732	break;
				733	} /* end 8 bpp */
				734
				735	case 16: /* png_ptr->row_info.pixel_depth */
				736	{
				737	png_bytep srcptr;
				738	png_bytep dstptr;
				739
				740	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				741	#if !defined(PNG_1_0_X)
				742	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
				743	/* && _mmx_supported */ )
				744	#else
				745	if (_mmx_supported)
				746	#endif
				747	{
				748	png_uint_32 len;
				749	int diff;
				750	int dummy_value_a; // fix 'forbidden register spilled' error
				751	int dummy_value_d;
				752	int dummy_value_c;
				753	int dummy_value_S;
				754	int dummy_value_D;
				755	_unmask = ~mask; // global variable for -fPIC version
				756	srcptr = png_ptr->row_buf + 1;
				757	dstptr = row;
				758	len = png_ptr->width &~7; // reduce to multiple of 8
				759	diff = (int) (png_ptr->width & 7); // amount lost //
				760
				761	__asm__ __volatile__ (
				762	"movd _unmask, %%mm7 \n\t" // load bit pattern
				763	"psubb %%mm6, %%mm6 \n\t" // zero mm6
				764	"punpcklbw %%mm7, %%mm7 \n\t"
				765	"punpcklwd %%mm7, %%mm7 \n\t"
				766	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
				767
				768	"movq _mask16_0, %%mm0 \n\t"
				769	"movq _mask16_1, %%mm1 \n\t"
				770
				771	"pand %%mm7, %%mm0 \n\t"
				772	"pand %%mm7, %%mm1 \n\t"
				773
				774	"pcmpeqb %%mm6, %%mm0 \n\t"
				775	"pcmpeqb %%mm6, %%mm1 \n\t"
				776
				777	// preload "movl len, %%ecx \n\t" // load length of line
				778	// preload "movl srcptr, %%esi \n\t" // load source
				779	// preload "movl dstptr, %%edi \n\t" // load dest
				780
				781	"cmpl $0, %%ecx \n\t"
				782	"jz mainloop16end \n\t"
				783
				784	"mainloop16: \n\t"
				785	"movq (%%esi), %%mm4 \n\t"
				786	"pand %%mm0, %%mm4 \n\t"
				787	"movq %%mm0, %%mm6 \n\t"
				788	"movq (%%edi), %%mm7 \n\t"
				789	"pandn %%mm7, %%mm6 \n\t"
				790	"por %%mm6, %%mm4 \n\t"
				791	"movq %%mm4, (%%edi) \n\t"
				792
				793	"movq 8(%%esi), %%mm5 \n\t"
				794	"pand %%mm1, %%mm5 \n\t"
				795	"movq %%mm1, %%mm7 \n\t"
				796	"movq 8(%%edi), %%mm6 \n\t"
				797	"pandn %%mm6, %%mm7 \n\t"
				798	"por %%mm7, %%mm5 \n\t"
				799	"movq %%mm5, 8(%%edi) \n\t"
				800
				801	"addl $16, %%esi \n\t" // inc by 16 bytes processed
				802	"addl $16, %%edi \n\t"
				803	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
				804	"ja mainloop16 \n\t"
				805
				806	"mainloop16end: \n\t"
				807	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
				808	"movl %%eax, %%ecx \n\t"
				809	"cmpl $0, %%ecx \n\t"
				810	"jz end16 \n\t"
				811	// preload "movl mask, %%edx \n\t"
				812	"sall $24, %%edx \n\t" // make low byte, high byte
				813
				814	"secondloop16: \n\t"
				815	"sall %%edx \n\t" // move high bit to CF
				816	"jnc skip16 \n\t" // if CF = 0
				817	"movw (%%esi), %%ax \n\t"
				818	"movw %%ax, (%%edi) \n\t"
				819
				820	"skip16: \n\t"
				821	"addl $2, %%esi \n\t"
				822	"addl $2, %%edi \n\t"
				823	"decl %%ecx \n\t"
				824	"jnz secondloop16 \n\t"
				825
				826	"end16: \n\t"
				827	"EMMS \n\t" // DONE
				828
				829	: "=a" (dummy_value_a), // output regs (dummy)
				830	"=c" (dummy_value_c),
				831	"=d" (dummy_value_d),
				832	"=S" (dummy_value_S),
				833	"=D" (dummy_value_D)
				834
				835	: "0" (diff), // eax // input regs
				836	// was (unmask) " " RESERVED // ebx // Global Offset Table idx
				837	"1" (len), // ecx
				838	"2" (mask), // edx
				839	"3" (srcptr), // esi
				840	"4" (dstptr) // edi
				841
				842	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				843	: "%mm0", "%mm1", "%mm4" // clobber list
				844	, "%mm5", "%mm6", "%mm7"
				845	#endif
				846	);
				847	}
				848	else /* mmx _not supported - Use modified C routine */
				849	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				850	{
				851	register png_uint_32 i;
				852	png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
				853	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
				854	register int stride = BPP2 * png_pass_inc[png_ptr->pass];
				855	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
				856	register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
				857	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
				858	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
				859	int diff = (int) (png_ptr->width & 7); /* amount lost */
				860	register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
				861
				862	srcptr = png_ptr->row_buf + 1 + initial_val;
				863	dstptr = row + initial_val;
				864
				865	for (i = initial_val; i < final_val; i += stride)
				866	{
				867	png_memcpy(dstptr, srcptr, rep_bytes);
				868	srcptr += stride;
				869	dstptr += stride;
				870	}
				871	if (diff) /* number of leftover pixels: 3 for pngtest */
				872	{
				873	final_val+=diff*BPP2;
				874	for (; i < final_val; i += stride)
				875	{
				876	if (rep_bytes > (int)(final_val-i))
				877	rep_bytes = (int)(final_val-i);
				878	png_memcpy(dstptr, srcptr, rep_bytes);
				879	srcptr += stride;
				880	dstptr += stride;
				881	}
				882	}
				883	} /* end of else (_mmx_supported) */
				884
				885	break;
				886	} /* end 16 bpp */
				887
				888	case 24: /* png_ptr->row_info.pixel_depth */
				889	{
				890	png_bytep srcptr;
				891	png_bytep dstptr;
				892
				893	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				894	#if !defined(PNG_1_0_X)
				895	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
				896	/* && _mmx_supported */ )
				897	#else
				898	if (_mmx_supported)
				899	#endif
				900	{
				901	png_uint_32 len;
				902	int diff;
				903	int dummy_value_a; // fix 'forbidden register spilled' error
				904	int dummy_value_d;
				905	int dummy_value_c;
				906	int dummy_value_S;
				907	int dummy_value_D;
				908	_unmask = ~mask; // global variable for -fPIC version
				909	srcptr = png_ptr->row_buf + 1;
				910	dstptr = row;
				911	len = png_ptr->width &~7; // reduce to multiple of 8
				912	diff = (int) (png_ptr->width & 7); // amount lost //
				913
				914	__asm__ __volatile__ (
				915	"movd _unmask, %%mm7 \n\t" // load bit pattern
				916	"psubb %%mm6, %%mm6 \n\t" // zero mm6
				917	"punpcklbw %%mm7, %%mm7 \n\t"
				918	"punpcklwd %%mm7, %%mm7 \n\t"
				919	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
				920
				921	"movq _mask24_0, %%mm0 \n\t"
				922	"movq _mask24_1, %%mm1 \n\t"
				923	"movq _mask24_2, %%mm2 \n\t"
				924
				925	"pand %%mm7, %%mm0 \n\t"
				926	"pand %%mm7, %%mm1 \n\t"
				927	"pand %%mm7, %%mm2 \n\t"
				928
				929	"pcmpeqb %%mm6, %%mm0 \n\t"
				930	"pcmpeqb %%mm6, %%mm1 \n\t"
				931	"pcmpeqb %%mm6, %%mm2 \n\t"
				932
				933	// preload "movl len, %%ecx \n\t" // load length of line
				934	// preload "movl srcptr, %%esi \n\t" // load source
				935	// preload "movl dstptr, %%edi \n\t" // load dest
				936
				937	"cmpl $0, %%ecx \n\t"
				938	"jz mainloop24end \n\t"
				939
				940	"mainloop24: \n\t"
				941	"movq (%%esi), %%mm4 \n\t"
				942	"pand %%mm0, %%mm4 \n\t"
				943	"movq %%mm0, %%mm6 \n\t"
				944	"movq (%%edi), %%mm7 \n\t"
				945	"pandn %%mm7, %%mm6 \n\t"
				946	"por %%mm6, %%mm4 \n\t"
				947	"movq %%mm4, (%%edi) \n\t"
				948
				949	"movq 8(%%esi), %%mm5 \n\t"
				950	"pand %%mm1, %%mm5 \n\t"
				951	"movq %%mm1, %%mm7 \n\t"
				952	"movq 8(%%edi), %%mm6 \n\t"
				953	"pandn %%mm6, %%mm7 \n\t"
				954	"por %%mm7, %%mm5 \n\t"
				955	"movq %%mm5, 8(%%edi) \n\t"
				956
				957	"movq 16(%%esi), %%mm6 \n\t"
				958	"pand %%mm2, %%mm6 \n\t"
				959	"movq %%mm2, %%mm4 \n\t"
				960	"movq 16(%%edi), %%mm7 \n\t"
				961	"pandn %%mm7, %%mm4 \n\t"
				962	"por %%mm4, %%mm6 \n\t"
				963	"movq %%mm6, 16(%%edi) \n\t"
				964
				965	"addl $24, %%esi \n\t" // inc by 24 bytes processed
				966	"addl $24, %%edi \n\t"
				967	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
				968
				969	"ja mainloop24 \n\t"
				970
				971	"mainloop24end: \n\t"
				972	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
				973	"movl %%eax, %%ecx \n\t"
				974	"cmpl $0, %%ecx \n\t"
				975	"jz end24 \n\t"
				976	// preload "movl mask, %%edx \n\t"
				977	"sall $24, %%edx \n\t" // make low byte, high byte
				978
				979	"secondloop24: \n\t"
				980	"sall %%edx \n\t" // move high bit to CF
				981	"jnc skip24 \n\t" // if CF = 0
				982	"movw (%%esi), %%ax \n\t"
				983	"movw %%ax, (%%edi) \n\t"
				984	"xorl %%eax, %%eax \n\t"
				985	"movb 2(%%esi), %%al \n\t"
				986	"movb %%al, 2(%%edi) \n\t"
				987
				988	"skip24: \n\t"
				989	"addl $3, %%esi \n\t"
				990	"addl $3, %%edi \n\t"
				991	"decl %%ecx \n\t"
				992	"jnz secondloop24 \n\t"
				993
				994	"end24: \n\t"
				995	"EMMS \n\t" // DONE
				996
				997	: "=a" (dummy_value_a), // output regs (dummy)
				998	"=d" (dummy_value_d),
				999	"=c" (dummy_value_c),
				1000	"=S" (dummy_value_S),
				1001	"=D" (dummy_value_D)
				1002
				1003	: "3" (srcptr), // esi // input regs
				1004	"4" (dstptr), // edi
				1005	"0" (diff), // eax
				1006	// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
				1007	"2" (len), // ecx
				1008	"1" (mask) // edx
				1009
				1010	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				1011	: "%mm0", "%mm1", "%mm2" // clobber list
				1012	, "%mm4", "%mm5", "%mm6", "%mm7"
				1013	#endif
				1014	);
				1015	}
				1016	else /* mmx _not supported - Use modified C routine */
				1017	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				1018	{
				1019	register png_uint_32 i;
				1020	png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
				1021	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
				1022	register int stride = BPP3 * png_pass_inc[png_ptr->pass];
				1023	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
				1024	register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
				1025	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
				1026	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
				1027	int diff = (int) (png_ptr->width & 7); /* amount lost */
				1028	register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
				1029
				1030	srcptr = png_ptr->row_buf + 1 + initial_val;
				1031	dstptr = row + initial_val;
				1032
				1033	for (i = initial_val; i < final_val; i += stride)
				1034	{
				1035	png_memcpy(dstptr, srcptr, rep_bytes);
				1036	srcptr += stride;
				1037	dstptr += stride;
				1038	}
				1039	if (diff) /* number of leftover pixels: 3 for pngtest */
				1040	{
				1041	final_val+=diff*BPP3;
				1042	for (; i < final_val; i += stride)
				1043	{
				1044	if (rep_bytes > (int)(final_val-i))
				1045	rep_bytes = (int)(final_val-i);
				1046	png_memcpy(dstptr, srcptr, rep_bytes);
				1047	srcptr += stride;
				1048	dstptr += stride;
				1049	}
				1050	}
				1051	} /* end of else (_mmx_supported) */
				1052
				1053	break;
				1054	} /* end 24 bpp */
				1055
				1056	case 32: /* png_ptr->row_info.pixel_depth */
				1057	{
				1058	png_bytep srcptr;
				1059	png_bytep dstptr;
				1060
				1061	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				1062	#if !defined(PNG_1_0_X)
				1063	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
				1064	/* && _mmx_supported */ )
				1065	#else
				1066	if (_mmx_supported)
				1067	#endif
				1068	{
				1069	png_uint_32 len;
				1070	int diff;
				1071	int dummy_value_a; // fix 'forbidden register spilled' error
				1072	int dummy_value_d;
				1073	int dummy_value_c;
				1074	int dummy_value_S;
				1075	int dummy_value_D;
				1076	_unmask = ~mask; // global variable for -fPIC version
				1077	srcptr = png_ptr->row_buf + 1;
				1078	dstptr = row;
				1079	len = png_ptr->width &~7; // reduce to multiple of 8
				1080	diff = (int) (png_ptr->width & 7); // amount lost //
				1081
				1082	__asm__ __volatile__ (
				1083	"movd _unmask, %%mm7 \n\t" // load bit pattern
				1084	"psubb %%mm6, %%mm6 \n\t" // zero mm6
				1085	"punpcklbw %%mm7, %%mm7 \n\t"
				1086	"punpcklwd %%mm7, %%mm7 \n\t"
				1087	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
				1088
				1089	"movq _mask32_0, %%mm0 \n\t"
				1090	"movq _mask32_1, %%mm1 \n\t"
				1091	"movq _mask32_2, %%mm2 \n\t"
				1092	"movq _mask32_3, %%mm3 \n\t"
				1093
				1094	"pand %%mm7, %%mm0 \n\t"
				1095	"pand %%mm7, %%mm1 \n\t"
				1096	"pand %%mm7, %%mm2 \n\t"
				1097	"pand %%mm7, %%mm3 \n\t"
				1098
				1099	"pcmpeqb %%mm6, %%mm0 \n\t"
				1100	"pcmpeqb %%mm6, %%mm1 \n\t"
				1101	"pcmpeqb %%mm6, %%mm2 \n\t"
				1102	"pcmpeqb %%mm6, %%mm3 \n\t"
				1103
				1104	// preload "movl len, %%ecx \n\t" // load length of line
				1105	// preload "movl srcptr, %%esi \n\t" // load source
				1106	// preload "movl dstptr, %%edi \n\t" // load dest
				1107
				1108	"cmpl $0, %%ecx \n\t" // lcr
				1109	"jz mainloop32end \n\t"
				1110
				1111	"mainloop32: \n\t"
				1112	"movq (%%esi), %%mm4 \n\t"
				1113	"pand %%mm0, %%mm4 \n\t"
				1114	"movq %%mm0, %%mm6 \n\t"
				1115	"movq (%%edi), %%mm7 \n\t"
				1116	"pandn %%mm7, %%mm6 \n\t"
				1117	"por %%mm6, %%mm4 \n\t"
				1118	"movq %%mm4, (%%edi) \n\t"
				1119
				1120	"movq 8(%%esi), %%mm5 \n\t"
				1121	"pand %%mm1, %%mm5 \n\t"
				1122	"movq %%mm1, %%mm7 \n\t"
				1123	"movq 8(%%edi), %%mm6 \n\t"
				1124	"pandn %%mm6, %%mm7 \n\t"
				1125	"por %%mm7, %%mm5 \n\t"
				1126	"movq %%mm5, 8(%%edi) \n\t"
				1127
				1128	"movq 16(%%esi), %%mm6 \n\t"
				1129	"pand %%mm2, %%mm6 \n\t"
				1130	"movq %%mm2, %%mm4 \n\t"
				1131	"movq 16(%%edi), %%mm7 \n\t"
				1132	"pandn %%mm7, %%mm4 \n\t"
				1133	"por %%mm4, %%mm6 \n\t"
				1134	"movq %%mm6, 16(%%edi) \n\t"
				1135
				1136	"movq 24(%%esi), %%mm7 \n\t"
				1137	"pand %%mm3, %%mm7 \n\t"
				1138	"movq %%mm3, %%mm5 \n\t"
				1139	"movq 24(%%edi), %%mm4 \n\t"
				1140	"pandn %%mm4, %%mm5 \n\t"
				1141	"por %%mm5, %%mm7 \n\t"
				1142	"movq %%mm7, 24(%%edi) \n\t"
				1143
				1144	"addl $32, %%esi \n\t" // inc by 32 bytes processed
				1145	"addl $32, %%edi \n\t"
				1146	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
				1147	"ja mainloop32 \n\t"
				1148
				1149	"mainloop32end: \n\t"
				1150	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
				1151	"movl %%eax, %%ecx \n\t"
				1152	"cmpl $0, %%ecx \n\t"
				1153	"jz end32 \n\t"
				1154	// preload "movl mask, %%edx \n\t"
				1155	"sall $24, %%edx \n\t" // low byte => high byte
				1156
				1157	"secondloop32: \n\t"
				1158	"sall %%edx \n\t" // move high bit to CF
				1159	"jnc skip32 \n\t" // if CF = 0
				1160	"movl (%%esi), %%eax \n\t"
				1161	"movl %%eax, (%%edi) \n\t"
				1162
				1163	"skip32: \n\t"
				1164	"addl $4, %%esi \n\t"
				1165	"addl $4, %%edi \n\t"
				1166	"decl %%ecx \n\t"
				1167	"jnz secondloop32 \n\t"
				1168
				1169	"end32: \n\t"
				1170	"EMMS \n\t" // DONE
				1171
				1172	: "=a" (dummy_value_a), // output regs (dummy)
				1173	"=d" (dummy_value_d),
				1174	"=c" (dummy_value_c),
				1175	"=S" (dummy_value_S),
				1176	"=D" (dummy_value_D)
				1177
				1178	: "3" (srcptr), // esi // input regs
				1179	"4" (dstptr), // edi
				1180	"0" (diff), // eax
				1181	// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
				1182	"2" (len), // ecx
				1183	"1" (mask) // edx
				1184
				1185	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				1186	: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
				1187	, "%mm4", "%mm5", "%mm6", "%mm7"
				1188	#endif
				1189	);
				1190	}
				1191	else /* mmx _not supported - Use modified C routine */
				1192	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				1193	{
				1194	register png_uint_32 i;
				1195	png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
				1196	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
				1197	register int stride = BPP4 * png_pass_inc[png_ptr->pass];
				1198	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
				1199	register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
				1200	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
				1201	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
				1202	int diff = (int) (png_ptr->width & 7); /* amount lost */
				1203	register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
				1204
				1205	srcptr = png_ptr->row_buf + 1 + initial_val;
				1206	dstptr = row + initial_val;
				1207
				1208	for (i = initial_val; i < final_val; i += stride)
				1209	{
				1210	png_memcpy(dstptr, srcptr, rep_bytes);
				1211	srcptr += stride;
				1212	dstptr += stride;
				1213	}
				1214	if (diff) /* number of leftover pixels: 3 for pngtest */
				1215	{
				1216	final_val+=diff*BPP4;
				1217	for (; i < final_val; i += stride)
				1218	{
				1219	if (rep_bytes > (int)(final_val-i))
				1220	rep_bytes = (int)(final_val-i);
				1221	png_memcpy(dstptr, srcptr, rep_bytes);
				1222	srcptr += stride;
				1223	dstptr += stride;
				1224	}
				1225	}
				1226	} /* end of else (_mmx_supported) */
				1227
				1228	break;
				1229	} /* end 32 bpp */
				1230
				1231	case 48: /* png_ptr->row_info.pixel_depth */
				1232	{
				1233	png_bytep srcptr;
				1234	png_bytep dstptr;
				1235
				1236	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				1237	#if !defined(PNG_1_0_X)
				1238	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
				1239	/* && _mmx_supported */ )
				1240	#else
				1241	if (_mmx_supported)
				1242	#endif
				1243	{
				1244	png_uint_32 len;
				1245	int diff;
				1246	int dummy_value_a; // fix 'forbidden register spilled' error
				1247	int dummy_value_d;
				1248	int dummy_value_c;
				1249	int dummy_value_S;
				1250	int dummy_value_D;
				1251	_unmask = ~mask; // global variable for -fPIC version
				1252	srcptr = png_ptr->row_buf + 1;
				1253	dstptr = row;
				1254	len = png_ptr->width &~7; // reduce to multiple of 8
				1255	diff = (int) (png_ptr->width & 7); // amount lost //
				1256
				1257	__asm__ __volatile__ (
				1258	"movd _unmask, %%mm7 \n\t" // load bit pattern
				1259	"psubb %%mm6, %%mm6 \n\t" // zero mm6
				1260	"punpcklbw %%mm7, %%mm7 \n\t"
				1261	"punpcklwd %%mm7, %%mm7 \n\t"
				1262	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
				1263
				1264	"movq _mask48_0, %%mm0 \n\t"
				1265	"movq _mask48_1, %%mm1 \n\t"
				1266	"movq _mask48_2, %%mm2 \n\t"
				1267	"movq _mask48_3, %%mm3 \n\t"
				1268	"movq _mask48_4, %%mm4 \n\t"
				1269	"movq _mask48_5, %%mm5 \n\t"
				1270
				1271	"pand %%mm7, %%mm0 \n\t"
				1272	"pand %%mm7, %%mm1 \n\t"
				1273	"pand %%mm7, %%mm2 \n\t"
				1274	"pand %%mm7, %%mm3 \n\t"
				1275	"pand %%mm7, %%mm4 \n\t"
				1276	"pand %%mm7, %%mm5 \n\t"
				1277
				1278	"pcmpeqb %%mm6, %%mm0 \n\t"
				1279	"pcmpeqb %%mm6, %%mm1 \n\t"
				1280	"pcmpeqb %%mm6, %%mm2 \n\t"
				1281	"pcmpeqb %%mm6, %%mm3 \n\t"
				1282	"pcmpeqb %%mm6, %%mm4 \n\t"
				1283	"pcmpeqb %%mm6, %%mm5 \n\t"
				1284
				1285	// preload "movl len, %%ecx \n\t" // load length of line
				1286	// preload "movl srcptr, %%esi \n\t" // load source
				1287	// preload "movl dstptr, %%edi \n\t" // load dest
				1288
				1289	"cmpl $0, %%ecx \n\t"
				1290	"jz mainloop48end \n\t"
				1291
				1292	"mainloop48: \n\t"
				1293	"movq (%%esi), %%mm7 \n\t"
				1294	"pand %%mm0, %%mm7 \n\t"
				1295	"movq %%mm0, %%mm6 \n\t"
				1296	"pandn (%%edi), %%mm6 \n\t"
				1297	"por %%mm6, %%mm7 \n\t"
				1298	"movq %%mm7, (%%edi) \n\t"
				1299
				1300	"movq 8(%%esi), %%mm6 \n\t"
				1301	"pand %%mm1, %%mm6 \n\t"
				1302	"movq %%mm1, %%mm7 \n\t"
				1303	"pandn 8(%%edi), %%mm7 \n\t"
				1304	"por %%mm7, %%mm6 \n\t"
				1305	"movq %%mm6, 8(%%edi) \n\t"
				1306
				1307	"movq 16(%%esi), %%mm6 \n\t"
				1308	"pand %%mm2, %%mm6 \n\t"
				1309	"movq %%mm2, %%mm7 \n\t"
				1310	"pandn 16(%%edi), %%mm7 \n\t"
				1311	"por %%mm7, %%mm6 \n\t"
				1312	"movq %%mm6, 16(%%edi) \n\t"
				1313
				1314	"movq 24(%%esi), %%mm7 \n\t"
				1315	"pand %%mm3, %%mm7 \n\t"
				1316	"movq %%mm3, %%mm6 \n\t"
				1317	"pandn 24(%%edi), %%mm6 \n\t"
				1318	"por %%mm6, %%mm7 \n\t"
				1319	"movq %%mm7, 24(%%edi) \n\t"
				1320
				1321	"movq 32(%%esi), %%mm6 \n\t"
				1322	"pand %%mm4, %%mm6 \n\t"
				1323	"movq %%mm4, %%mm7 \n\t"
				1324	"pandn 32(%%edi), %%mm7 \n\t"
				1325	"por %%mm7, %%mm6 \n\t"
				1326	"movq %%mm6, 32(%%edi) \n\t"
				1327
				1328	"movq 40(%%esi), %%mm7 \n\t"
				1329	"pand %%mm5, %%mm7 \n\t"
				1330	"movq %%mm5, %%mm6 \n\t"
				1331	"pandn 40(%%edi), %%mm6 \n\t"
				1332	"por %%mm6, %%mm7 \n\t"
				1333	"movq %%mm7, 40(%%edi) \n\t"
				1334
				1335	"addl $48, %%esi \n\t" // inc by 48 bytes processed
				1336	"addl $48, %%edi \n\t"
				1337	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
				1338
				1339	"ja mainloop48 \n\t"
				1340
				1341	"mainloop48end: \n\t"
				1342	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
				1343	"movl %%eax, %%ecx \n\t"
				1344	"cmpl $0, %%ecx \n\t"
				1345	"jz end48 \n\t"
				1346	// preload "movl mask, %%edx \n\t"
				1347	"sall $24, %%edx \n\t" // make low byte, high byte
				1348
				1349	"secondloop48: \n\t"
				1350	"sall %%edx \n\t" // move high bit to CF
				1351	"jnc skip48 \n\t" // if CF = 0
				1352	"movl (%%esi), %%eax \n\t"
				1353	"movl %%eax, (%%edi) \n\t"
				1354
				1355	"skip48: \n\t"
				1356	"addl $4, %%esi \n\t"
				1357	"addl $4, %%edi \n\t"
				1358	"decl %%ecx \n\t"
				1359	"jnz secondloop48 \n\t"
				1360
				1361	"end48: \n\t"
				1362	"EMMS \n\t" // DONE
				1363
				1364	: "=a" (dummy_value_a), // output regs (dummy)
				1365	"=d" (dummy_value_d),
				1366	"=c" (dummy_value_c),
				1367	"=S" (dummy_value_S),
				1368	"=D" (dummy_value_D)
				1369
				1370	: "3" (srcptr), // esi // input regs
				1371	"4" (dstptr), // edi
				1372	"0" (diff), // eax
				1373	// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
				1374	"2" (len), // ecx
				1375	"1" (mask) // edx
				1376
				1377	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				1378	: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
				1379	, "%mm4", "%mm5", "%mm6", "%mm7"
				1380	#endif
				1381	);
				1382	}
				1383	else /* mmx _not supported - Use modified C routine */
				1384	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				1385	{
				1386	register png_uint_32 i;
				1387	png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
				1388	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
				1389	register int stride = BPP6 * png_pass_inc[png_ptr->pass];
				1390	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
				1391	register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
				1392	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
				1393	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
				1394	int diff = (int) (png_ptr->width & 7); /* amount lost */
				1395	register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
				1396
				1397	srcptr = png_ptr->row_buf + 1 + initial_val;
				1398	dstptr = row + initial_val;
				1399
				1400	for (i = initial_val; i < final_val; i += stride)
				1401	{
				1402	png_memcpy(dstptr, srcptr, rep_bytes);
				1403	srcptr += stride;
				1404	dstptr += stride;
				1405	}
				1406	if (diff) /* number of leftover pixels: 3 for pngtest */
				1407	{
				1408	final_val+=diff*BPP6;
				1409	for (; i < final_val; i += stride)
				1410	{
				1411	if (rep_bytes > (int)(final_val-i))
				1412	rep_bytes = (int)(final_val-i);
				1413	png_memcpy(dstptr, srcptr, rep_bytes);
				1414	srcptr += stride;
				1415	dstptr += stride;
				1416	}
				1417	}
				1418	} /* end of else (_mmx_supported) */
				1419
				1420	break;
				1421	} /* end 48 bpp */
				1422
				1423	case 64: /* png_ptr->row_info.pixel_depth */
				1424	{
				1425	png_bytep srcptr;
				1426	png_bytep dstptr;
				1427	register png_uint_32 i;
				1428	png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
				1429	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
				1430	register int stride = BPP8 * png_pass_inc[png_ptr->pass];
				1431	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
				1432	register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
				1433	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
				1434	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
				1435	int diff = (int) (png_ptr->width & 7); /* amount lost */
				1436	register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
				1437
				1438	srcptr = png_ptr->row_buf + 1 + initial_val;
				1439	dstptr = row + initial_val;
				1440
				1441	for (i = initial_val; i < final_val; i += stride)
				1442	{
				1443	png_memcpy(dstptr, srcptr, rep_bytes);
				1444	srcptr += stride;
				1445	dstptr += stride;
				1446	}
				1447	if (diff) /* number of leftover pixels: 3 for pngtest */
				1448	{
				1449	final_val+=diff*BPP8;
				1450	for (; i < final_val; i += stride)
				1451	{
				1452	if (rep_bytes > (int)(final_val-i))
				1453	rep_bytes = (int)(final_val-i);
				1454	png_memcpy(dstptr, srcptr, rep_bytes);
				1455	srcptr += stride;
				1456	dstptr += stride;
				1457	}
				1458	}
				1459
				1460	break;
				1461	} /* end 64 bpp */
				1462
				1463	default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
				1464	{
				1465	/* this should never happen */
				1466	png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
				1467	break;
				1468	}
				1469	} /* end switch (png_ptr->row_info.pixel_depth) */
				1470
				1471	} /* end if (non-trivial mask) */
				1472
				1473	} /* end png_combine_row() */
				1474
				1475	#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
				1476
				1477
				1478
				1479
				1480	/*===========================================================================*/
				1481	/* */
				1482	/* P N G _ D O _ R E A D _ I N T E R L A C E */
				1483	/* */
				1484	/*===========================================================================*/
				1485
				1486	#if defined(PNG_READ_INTERLACING_SUPPORTED)
				1487	#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
				1488
				1489	/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
				1490	* has taken place. [GRR: what other steps come before and/or after?]
				1491	*/
				1492
				1493	void /* PRIVATE */
				1494	png_do_read_interlace(png_structp png_ptr)
				1495	{
				1496	png_row_infop row_info = &(png_ptr->row_info);
				1497	png_bytep row = png_ptr->row_buf + 1;
				1498	int pass = png_ptr->pass;
				1499	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
				1500	png_uint_32 transformations = png_ptr->transformations;
				1501	#endif
				1502
				1503	png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
				1504
				1505	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
				1506	if (_mmx_supported == 2) {
				1507	#if !defined(PNG_1_0_X)
				1508	/* this should have happened in png_init_mmx_flags() already */
				1509	png_warning(png_ptr, "asm_flags may not have been initialized");
				1510	#endif
				1511	png_mmx_support();
				1512	}
				1513	#endif
				1514
				1515	if (row != NULL && row_info != NULL)
				1516	{
				1517	png_uint_32 final_width;
				1518
				1519	final_width = row_info->width * png_pass_inc[pass];
				1520
				1521	switch (row_info->pixel_depth)
				1522	{
				1523	case 1:
				1524	{
				1525	png_bytep sp, dp;
				1526	int sshift, dshift;
				1527	int s_start, s_end, s_inc;
				1528	png_byte v;
				1529	png_uint_32 i;
				1530	int j;
				1531
				1532	sp = row + (png_size_t)((row_info->width - 1) >> 3);
				1533	dp = row + (png_size_t)((final_width - 1) >> 3);
				1534	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
				1535	if (transformations & PNG_PACKSWAP)
				1536	{
				1537	sshift = (int)((row_info->width + 7) & 7);
				1538	dshift = (int)((final_width + 7) & 7);
				1539	s_start = 7;
				1540	s_end = 0;
				1541	s_inc = -1;
				1542	}
				1543	else
				1544	#endif
				1545	{
				1546	sshift = 7 - (int)((row_info->width + 7) & 7);
				1547	dshift = 7 - (int)((final_width + 7) & 7);
				1548	s_start = 0;
				1549	s_end = 7;
				1550	s_inc = 1;
				1551	}
				1552
				1553	for (i = row_info->width; i; i--)
				1554	{
				1555	v = (png_byte)((*sp >> sshift) & 0x1);
				1556	for (j = 0; j < png_pass_inc[pass]; j++)
				1557	{
				1558	*dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
				1559	*dp \|= (png_byte)(v << dshift);
				1560	if (dshift == s_end)
				1561	{
				1562	dshift = s_start;
				1563	dp--;
				1564	}
				1565	else
				1566	dshift += s_inc;
				1567	}
				1568	if (sshift == s_end)
				1569	{
				1570	sshift = s_start;
				1571	sp--;
				1572	}
				1573	else
				1574	sshift += s_inc;
				1575	}
				1576	break;
				1577	}
				1578
				1579	case 2:
				1580	{
				1581	png_bytep sp, dp;
				1582	int sshift, dshift;
				1583	int s_start, s_end, s_inc;
				1584	png_uint_32 i;
				1585
				1586	sp = row + (png_size_t)((row_info->width - 1) >> 2);
				1587	dp = row + (png_size_t)((final_width - 1) >> 2);
				1588	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
				1589	if (transformations & PNG_PACKSWAP)
				1590	{
				1591	sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
				1592	dshift = (png_size_t)(((final_width + 3) & 3) << 1);
				1593	s_start = 6;
				1594	s_end = 0;
				1595	s_inc = -2;
				1596	}
				1597	else
				1598	#endif
				1599	{
				1600	sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
				1601	dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
				1602	s_start = 0;
				1603	s_end = 6;
				1604	s_inc = 2;
				1605	}
				1606
				1607	for (i = row_info->width; i; i--)
				1608	{
				1609	png_byte v;
				1610	int j;
				1611
				1612	v = (png_byte)((*sp >> sshift) & 0x3);
				1613	for (j = 0; j < png_pass_inc[pass]; j++)
				1614	{
				1615	*dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
				1616	*dp \|= (png_byte)(v << dshift);
				1617	if (dshift == s_end)
				1618	{
				1619	dshift = s_start;
				1620	dp--;
				1621	}
				1622	else
				1623	dshift += s_inc;
				1624	}
				1625	if (sshift == s_end)
				1626	{
				1627	sshift = s_start;
				1628	sp--;
				1629	}
				1630	else
				1631	sshift += s_inc;
				1632	}
				1633	break;
				1634	}
				1635
				1636	case 4:
				1637	{
				1638	png_bytep sp, dp;
				1639	int sshift, dshift;
				1640	int s_start, s_end, s_inc;
				1641	png_uint_32 i;
				1642
				1643	sp = row + (png_size_t)((row_info->width - 1) >> 1);
				1644	dp = row + (png_size_t)((final_width - 1) >> 1);
				1645	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
				1646	if (transformations & PNG_PACKSWAP)
				1647	{
				1648	sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
				1649	dshift = (png_size_t)(((final_width + 1) & 1) << 2);
				1650	s_start = 4;
				1651	s_end = 0;
				1652	s_inc = -4;
				1653	}
				1654	else
				1655	#endif
				1656	{
				1657	sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
				1658	dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
				1659	s_start = 0;
				1660	s_end = 4;
				1661	s_inc = 4;
				1662	}
				1663
				1664	for (i = row_info->width; i; i--)
				1665	{
				1666	png_byte v;
				1667	int j;
				1668
				1669	v = (png_byte)((*sp >> sshift) & 0xf);
				1670	for (j = 0; j < png_pass_inc[pass]; j++)
				1671	{
				1672	*dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
				1673	*dp \|= (png_byte)(v << dshift);
				1674	if (dshift == s_end)
				1675	{
				1676	dshift = s_start;
				1677	dp--;
				1678	}
				1679	else
				1680	dshift += s_inc;
				1681	}
				1682	if (sshift == s_end)
				1683	{
				1684	sshift = s_start;
				1685	sp--;
				1686	}
				1687	else
				1688	sshift += s_inc;
				1689	}
				1690	break;
				1691	}
				1692
				1693	/*====================================================================*/
				1694
				1695	default: /* 8-bit or larger (this is where the routine is modified) */
				1696	{
				1697	#if 0
				1698	// static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
				1699	// static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
				1700	// unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
				1701	// unsigned long long const4 = 0x0000000000FFFFFFLL; no good
				1702	#endif
				1703	png_bytep sptr, dp;
				1704	png_uint_32 i;
				1705	png_size_t pixel_bytes;
				1706	int width = (int)row_info->width;
				1707
				1708	pixel_bytes = (row_info->pixel_depth >> 3);
				1709
				1710	/* point sptr at the last pixel in the pre-expanded row: */
				1711	sptr = row + (width - 1) * pixel_bytes;
				1712
				1713	/* point dp at the last pixel position in the expanded row: */
				1714	dp = row + (final_width - 1) * pixel_bytes;
				1715
				1716	/* New code by Nirav Chhatrapati - Intel Corporation */
				1717
				1718	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
				1719	#if !defined(PNG_1_0_X)
				1720	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
				1721	/* && _mmx_supported */ )
				1722	#else
				1723	if (_mmx_supported)
				1724	#endif
				1725	{
				1726	//--------------------------------------------------------------
				1727	if (pixel_bytes == 3)
				1728	{
				1729	if (((pass == 0) \|\| (pass == 1)) && width)
				1730	{
				1731	int dummy_value_c; // fix 'forbidden register spilled'
				1732	int dummy_value_S;
				1733	int dummy_value_D;
				1734
				1735	__asm__ __volatile__ (
				1736	"subl $21, %%edi \n\t"
				1737	// (png_pass_inc[pass] - 1)*pixel_bytes
				1738
				1739	".loop3_pass0: \n\t"
				1740	"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
				1741	"pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
				1742	"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
				1743	"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
				1744	"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
				1745	"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
				1746	"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
				1747	"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
				1748	"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
				1749	"movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
				1750	"psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
				1751	"movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
				1752	"punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
				1753	"movq %%mm4, 16(%%edi) \n\t"
				1754	"psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
				1755	"movq %%mm3, 8(%%edi) \n\t"
				1756	"punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
				1757	"subl $3, %%esi \n\t"
				1758	"movq %%mm0, (%%edi) \n\t"
				1759	"subl $24, %%edi \n\t"
				1760	"decl %%ecx \n\t"
				1761	"jnz .loop3_pass0 \n\t"
				1762	"EMMS \n\t" // DONE
				1763
				1764	: "=c" (dummy_value_c), // output regs (dummy)
				1765	"=S" (dummy_value_S),
				1766	"=D" (dummy_value_D)
				1767
				1768	: "1" (sptr), // esi // input regs
				1769	"2" (dp), // edi
				1770	"0" (width) // ecx
				1771	// doesn't work "i" (0x0000000000FFFFFFLL) // %1 (a.k.a. _const4)
				1772
				1773	#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
				1774	: "%mm0", "%mm1", "%mm2" // clobber list
				1775	, "%mm3", "%mm4"
				1776	#endif
				1777	);
				1778	}
				1779	else if (((pass == 2) \|\| (pass == 3)) && width)
				1780	{
				1781	int dummy_value_c; // fix 'forbidden register spilled'
				1782	int dummy_value_S;
				1783	int dummy_value_D;
				1784
				1785	__asm__ __volatile__ (
				1786	"subl $9, %%edi \n\t"
				1787	// (png_pass_inc[pass] - 1)*pixel_bytes
				1788
				1789	".loop3_pass2: \n\t"
				1790	"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
				1791	"pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
				1792	"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
				1793	"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
				1794	"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
				1795	"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
				1796	"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
				1797	"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
				1798	"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
				1799	"movq %%mm0, 4(%%edi) \n\t"
				1800	"psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
				1801	"subl $3, %%esi \n\t"
				1802	"movd %%mm0, (%%edi) \n\t"
				1803	"subl $12, %%edi \n\t"
				1804	"decl %%ecx \n\t"
				1805	"jnz .loop3_pass2 \n\t"
				1806	"EMMS \n\t" // DONE
				1807
				1808	: "=c" (dummy_value_c), // output regs (dummy)
				1809	"=S" (dummy_value_S),
				1810	"=D" (dummy_value_D)
				1811
				1812	: "1" (sptr), // esi // input regs
				1813	"2" (dp), // edi
				1814	"0" (width) // ecx
				1815
				1816	#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
				1817	: "%mm0", "%mm1", "%mm2" // clobber list
				1818	#endif
				1819	);
				1820	}
				1821	else if (width) /* && ((pass == 4) \|\| (pass == 5)) */
				1822	{
				1823	int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
				1824	if (width_mmx < 0)
				1825	width_mmx = 0;
				1826	width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
				1827	if (width_mmx)
				1828	{
				1829	// png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
				1830	// sptr points at last pixel in pre-expanded row
				1831	// dp points at last pixel position in expanded row
				1832	int dummy_value_c; // fix 'forbidden register spilled'
				1833	int dummy_value_S;
				1834	int dummy_value_D;
				1835
				1836	__asm__ __volatile__ (
				1837	"subl $3, %%esi \n\t"
				1838	"subl $9, %%edi \n\t"
				1839	// (png_pass_inc[pass] + 1)*pixel_bytes
				1840
				1841	".loop3_pass4: \n\t"
				1842	"movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
				1843	"movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
				1844	"movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
				1845	"psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
				1846	"pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
				1847	"psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
				1848	"por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
				1849	"movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
				1850	"psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
				1851	"movq %%mm0, (%%edi) \n\t"
				1852	"psrlq $16, %%mm3 \n\t" // z z z z z x x 5
				1853	"pand _const6, %%mm3 \n\t" // z z z z z z z 5
				1854	"por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
				1855	"subl $6, %%esi \n\t"
				1856	"movd %%mm2, 8(%%edi) \n\t"
				1857	"subl $12, %%edi \n\t"
				1858	"subl $2, %%ecx \n\t"
				1859	"jnz .loop3_pass4 \n\t"
				1860	"EMMS \n\t" // DONE
				1861
				1862	: "=c" (dummy_value_c), // output regs (dummy)
				1863	"=S" (dummy_value_S),
				1864	"=D" (dummy_value_D)
				1865
				1866	: "1" (sptr), // esi // input regs
				1867	"2" (dp), // edi
				1868	"0" (width_mmx) // ecx
				1869
				1870	#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
				1871	: "%mm0", "%mm1" // clobber list
				1872	, "%mm2", "%mm3"
				1873	#endif
				1874	);
				1875	}
				1876
				1877	sptr -= width_mmx*3;
				1878	dp -= width_mmx*6;
				1879	for (i = width; i; i--)
				1880	{
				1881	png_byte v[8];
				1882	int j;
				1883
				1884	png_memcpy(v, sptr, 3);
				1885	for (j = 0; j < png_pass_inc[pass]; j++)
				1886	{
				1887	png_memcpy(dp, v, 3);
				1888	dp -= 3;
				1889	}
				1890	sptr -= 3;
				1891	}
				1892	}
				1893	} /* end of pixel_bytes == 3 */
				1894
				1895	//--------------------------------------------------------------
				1896	else if (pixel_bytes == 1)
				1897	{
				1898	if (((pass == 0) \|\| (pass == 1)) && width)
				1899	{
				1900	int width_mmx = ((width >> 2) << 2);
				1901	width -= width_mmx; // 0-3 pixels => 0-3 bytes
				1902	if (width_mmx)
				1903	{
				1904	int dummy_value_c; // fix 'forbidden register spilled'
				1905	int dummy_value_S;
				1906	int dummy_value_D;
				1907
				1908	__asm__ __volatile__ (
				1909	"subl $3, %%esi \n\t"
				1910	"subl $31, %%edi \n\t"
				1911
				1912	".loop1_pass0: \n\t"
				1913	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
				1914	"movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
				1915	"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
				1916	"movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
				1917	"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
				1918	"movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
				1919	"punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
				1920	"punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
				1921	"movq %%mm0, (%%edi) \n\t"
				1922	"punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
				1923	"movq %%mm3, 8(%%edi) \n\t"
				1924	"movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
				1925	"punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
				1926	"punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
				1927	"movq %%mm2, 16(%%edi) \n\t"
				1928	"subl $4, %%esi \n\t"
				1929	"movq %%mm4, 24(%%edi) \n\t"
				1930	"subl $32, %%edi \n\t"
				1931	"subl $4, %%ecx \n\t"
				1932	"jnz .loop1_pass0 \n\t"
				1933	"EMMS \n\t" // DONE
				1934
				1935	: "=c" (dummy_value_c), // output regs (dummy)
				1936	"=S" (dummy_value_S),
				1937	"=D" (dummy_value_D)
				1938
				1939	: "1" (sptr), // esi // input regs
				1940	"2" (dp), // edi
				1941	"0" (width_mmx) // ecx
				1942
				1943	#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
				1944	: "%mm0", "%mm1", "%mm2" // clobber list
				1945	, "%mm3", "%mm4"
				1946	#endif
				1947	);
				1948	}
				1949
				1950	sptr -= width_mmx;
				1951	dp -= width_mmx*8;
				1952	for (i = width; i; i--)
				1953	{
				1954	int j;
				1955
				1956	/* I simplified this part in version 1.0.4e
				1957	* here and in several other instances where
				1958	* pixel_bytes == 1 -- GR-P
				1959	*
				1960	* Original code:
				1961	*
				1962	* png_byte v[8];
				1963	* png_memcpy(v, sptr, pixel_bytes);
				1964	* for (j = 0; j < png_pass_inc[pass]; j++)
				1965	* {
				1966	* png_memcpy(dp, v, pixel_bytes);
				1967	* dp -= pixel_bytes;
				1968	* }
				1969	* sptr -= pixel_bytes;
				1970	*
				1971	* Replacement code is in the next three lines:
				1972	*/
				1973
				1974	for (j = 0; j < png_pass_inc[pass]; j++)
				1975	{
				1976	dp-- = sptr;
				1977	}
				1978	--sptr;
				1979	}
				1980	}
				1981	else if (((pass == 2) \|\| (pass == 3)) && width)
				1982	{
				1983	int width_mmx = ((width >> 2) << 2);
				1984	width -= width_mmx; // 0-3 pixels => 0-3 bytes
				1985	if (width_mmx)
				1986	{
				1987	int dummy_value_c; // fix 'forbidden register spilled'
				1988	int dummy_value_S;
				1989	int dummy_value_D;
				1990
				1991	__asm__ __volatile__ (
				1992	"subl $3, %%esi \n\t"
				1993	"subl $15, %%edi \n\t"
				1994
				1995	".loop1_pass2: \n\t"
				1996	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
				1997	"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
				1998	"movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
				1999	"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
				2000	"punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
				2001	"movq %%mm0, (%%edi) \n\t"
				2002	"subl $4, %%esi \n\t"
				2003	"movq %%mm1, 8(%%edi) \n\t"
				2004	"subl $16, %%edi \n\t"
				2005	"subl $4, %%ecx \n\t"
				2006	"jnz .loop1_pass2 \n\t"
				2007	"EMMS \n\t" // DONE
				2008
				2009	: "=c" (dummy_value_c), // output regs (dummy)
				2010	"=S" (dummy_value_S),
				2011	"=D" (dummy_value_D)
				2012
				2013	: "1" (sptr), // esi // input regs
				2014	"2" (dp), // edi
				2015	"0" (width_mmx) // ecx
				2016
				2017	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2018	: "%mm0", "%mm1" // clobber list
				2019	#endif
				2020	);
				2021	}
				2022
				2023	sptr -= width_mmx;
				2024	dp -= width_mmx*4;
				2025	for (i = width; i; i--)
				2026	{
				2027	int j;
				2028
				2029	for (j = 0; j < png_pass_inc[pass]; j++)
				2030	{
				2031	dp-- = sptr;
				2032	}
				2033	--sptr;
				2034	}
				2035	}
				2036	else if (width) /* && ((pass == 4) \|\| (pass == 5)) */
				2037	{
				2038	int width_mmx = ((width >> 3) << 3);
				2039	width -= width_mmx; // 0-7 pixels => 0-7 bytes
				2040	if (width_mmx)
				2041	{
				2042	int dummy_value_c; // fix 'forbidden register spilled'
				2043	int dummy_value_S;
				2044	int dummy_value_D;
				2045
				2046	__asm__ __volatile__ (
				2047	"subl $7, %%esi \n\t"
				2048	"subl $15, %%edi \n\t"
				2049
				2050	".loop1_pass4: \n\t"
				2051	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
				2052	"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
				2053	"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
				2054	"punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
				2055	"movq %%mm1, 8(%%edi) \n\t"
				2056	"subl $8, %%esi \n\t"
				2057	"movq %%mm0, (%%edi) \n\t"
				2058	"subl $16, %%edi \n\t"
				2059	"subl $8, %%ecx \n\t"
				2060	"jnz .loop1_pass4 \n\t"
				2061	"EMMS \n\t" // DONE
				2062
				2063	: "=c" (dummy_value_c), // output regs (dummy)
				2064	"=S" (dummy_value_S),
				2065	"=D" (dummy_value_D)
				2066
				2067	: "1" (sptr), // esi // input regs
				2068	"2" (dp), // edi
				2069	"0" (width_mmx) // ecx
				2070
				2071	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2072	: "%mm0", "%mm1" // clobber list
				2073	#endif
				2074	);
				2075	}
				2076
				2077	sptr -= width_mmx;
				2078	dp -= width_mmx*2;
				2079	for (i = width; i; i--)
				2080	{
				2081	int j;
				2082
				2083	for (j = 0; j < png_pass_inc[pass]; j++)
				2084	{
				2085	dp-- = sptr;
				2086	}
				2087	--sptr;
				2088	}
				2089	}
				2090	} /* end of pixel_bytes == 1 */
				2091
				2092	//--------------------------------------------------------------
				2093	else if (pixel_bytes == 2)
				2094	{
				2095	if (((pass == 0) \|\| (pass == 1)) && width)
				2096	{
				2097	int width_mmx = ((width >> 1) << 1);
				2098	width -= width_mmx; // 0,1 pixels => 0,2 bytes
				2099	if (width_mmx)
				2100	{
				2101	int dummy_value_c; // fix 'forbidden register spilled'
				2102	int dummy_value_S;
				2103	int dummy_value_D;
				2104
				2105	__asm__ __volatile__ (
				2106	"subl $2, %%esi \n\t"
				2107	"subl $30, %%edi \n\t"
				2108
				2109	".loop2_pass0: \n\t"
				2110	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
				2111	"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
				2112	"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
				2113	"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
				2114	"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
				2115	"movq %%mm0, (%%edi) \n\t"
				2116	"movq %%mm0, 8(%%edi) \n\t"
				2117	"movq %%mm1, 16(%%edi) \n\t"
				2118	"subl $4, %%esi \n\t"
				2119	"movq %%mm1, 24(%%edi) \n\t"
				2120	"subl $32, %%edi \n\t"
				2121	"subl $2, %%ecx \n\t"
				2122	"jnz .loop2_pass0 \n\t"
				2123	"EMMS \n\t" // DONE
				2124
				2125	: "=c" (dummy_value_c), // output regs (dummy)
				2126	"=S" (dummy_value_S),
				2127	"=D" (dummy_value_D)
				2128
				2129	: "1" (sptr), // esi // input regs
				2130	"2" (dp), // edi
				2131	"0" (width_mmx) // ecx
				2132
				2133	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2134	: "%mm0", "%mm1" // clobber list
				2135	#endif
				2136	);
				2137	}
				2138
				2139	sptr -= (width_mmx*2 - 2); // sign fixed
				2140	dp -= (width_mmx*16 - 2); // sign fixed
				2141	for (i = width; i; i--)
				2142	{
				2143	png_byte v[8];
				2144	int j;
				2145	sptr -= 2;
				2146	png_memcpy(v, sptr, 2);
				2147	for (j = 0; j < png_pass_inc[pass]; j++)
				2148	{
				2149	dp -= 2;
				2150	png_memcpy(dp, v, 2);
				2151	}
				2152	}
				2153	}
				2154	else if (((pass == 2) \|\| (pass == 3)) && width)
				2155	{
				2156	int width_mmx = ((width >> 1) << 1) ;
				2157	width -= width_mmx; // 0,1 pixels => 0,2 bytes
				2158	if (width_mmx)
				2159	{
				2160	int dummy_value_c; // fix 'forbidden register spilled'
				2161	int dummy_value_S;
				2162	int dummy_value_D;
				2163
				2164	__asm__ __volatile__ (
				2165	"subl $2, %%esi \n\t"
				2166	"subl $14, %%edi \n\t"
				2167
				2168	".loop2_pass2: \n\t"
				2169	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
				2170	"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
				2171	"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
				2172	"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
				2173	"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
				2174	"movq %%mm0, (%%edi) \n\t"
				2175	"subl $4, %%esi \n\t"
				2176	"movq %%mm1, 8(%%edi) \n\t"
				2177	"subl $16, %%edi \n\t"
				2178	"subl $2, %%ecx \n\t"
				2179	"jnz .loop2_pass2 \n\t"
				2180	"EMMS \n\t" // DONE
				2181
				2182	: "=c" (dummy_value_c), // output regs (dummy)
				2183	"=S" (dummy_value_S),
				2184	"=D" (dummy_value_D)
				2185
				2186	: "1" (sptr), // esi // input regs
				2187	"2" (dp), // edi
				2188	"0" (width_mmx) // ecx
				2189
				2190	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2191	: "%mm0", "%mm1" // clobber list
				2192	#endif
				2193	);
				2194	}
				2195
				2196	sptr -= (width_mmx*2 - 2); // sign fixed
				2197	dp -= (width_mmx*8 - 2); // sign fixed
				2198	for (i = width; i; i--)
				2199	{
				2200	png_byte v[8];
				2201	int j;
				2202	sptr -= 2;
				2203	png_memcpy(v, sptr, 2);
				2204	for (j = 0; j < png_pass_inc[pass]; j++)
				2205	{
				2206	dp -= 2;
				2207	png_memcpy(dp, v, 2);
				2208	}
				2209	}
				2210	}
				2211	else if (width) // pass == 4 or 5
				2212	{
				2213	int width_mmx = ((width >> 1) << 1) ;
				2214	width -= width_mmx; // 0,1 pixels => 0,2 bytes
				2215	if (width_mmx)
				2216	{
				2217	int dummy_value_c; // fix 'forbidden register spilled'
				2218	int dummy_value_S;
				2219	int dummy_value_D;
				2220
				2221	__asm__ __volatile__ (
				2222	"subl $2, %%esi \n\t"
				2223	"subl $6, %%edi \n\t"
				2224
				2225	".loop2_pass4: \n\t"
				2226	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
				2227	"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
				2228	"subl $4, %%esi \n\t"
				2229	"movq %%mm0, (%%edi) \n\t"
				2230	"subl $8, %%edi \n\t"
				2231	"subl $2, %%ecx \n\t"
				2232	"jnz .loop2_pass4 \n\t"
				2233	"EMMS \n\t" // DONE
				2234
				2235	: "=c" (dummy_value_c), // output regs (dummy)
				2236	"=S" (dummy_value_S),
				2237	"=D" (dummy_value_D)
				2238
				2239	: "1" (sptr), // esi // input regs
				2240	"2" (dp), // edi
				2241	"0" (width_mmx) // ecx
				2242
				2243	#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2244	: "%mm0" // clobber list
				2245	#endif
				2246	);
				2247	}
				2248
				2249	sptr -= (width_mmx*2 - 2); // sign fixed
				2250	dp -= (width_mmx*4 - 2); // sign fixed
				2251	for (i = width; i; i--)
				2252	{
				2253	png_byte v[8];
				2254	int j;
				2255	sptr -= 2;
				2256	png_memcpy(v, sptr, 2);
				2257	for (j = 0; j < png_pass_inc[pass]; j++)
				2258	{
				2259	dp -= 2;
				2260	png_memcpy(dp, v, 2);
				2261	}
				2262	}
				2263	}
				2264	} /* end of pixel_bytes == 2 */
				2265
				2266	//--------------------------------------------------------------
				2267	else if (pixel_bytes == 4)
				2268	{
				2269	if (((pass == 0) \|\| (pass == 1)) && width)
				2270	{
				2271	int width_mmx = ((width >> 1) << 1);
				2272	width -= width_mmx; // 0,1 pixels => 0,4 bytes
				2273	if (width_mmx)
				2274	{
				2275	int dummy_value_c; // fix 'forbidden register spilled'
				2276	int dummy_value_S;
				2277	int dummy_value_D;
				2278
				2279	__asm__ __volatile__ (
				2280	"subl $4, %%esi \n\t"
				2281	"subl $60, %%edi \n\t"
				2282
				2283	".loop4_pass0: \n\t"
				2284	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
				2285	"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
				2286	"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
				2287	"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
				2288	"movq %%mm0, (%%edi) \n\t"
				2289	"movq %%mm0, 8(%%edi) \n\t"
				2290	"movq %%mm0, 16(%%edi) \n\t"
				2291	"movq %%mm0, 24(%%edi) \n\t"
				2292	"movq %%mm1, 32(%%edi) \n\t"
				2293	"movq %%mm1, 40(%%edi) \n\t"
				2294	"movq %%mm1, 48(%%edi) \n\t"
				2295	"subl $8, %%esi \n\t"
				2296	"movq %%mm1, 56(%%edi) \n\t"
				2297	"subl $64, %%edi \n\t"
				2298	"subl $2, %%ecx \n\t"
				2299	"jnz .loop4_pass0 \n\t"
				2300	"EMMS \n\t" // DONE
				2301
				2302	: "=c" (dummy_value_c), // output regs (dummy)
				2303	"=S" (dummy_value_S),
				2304	"=D" (dummy_value_D)
				2305
				2306	: "1" (sptr), // esi // input regs
				2307	"2" (dp), // edi
				2308	"0" (width_mmx) // ecx
				2309
				2310	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2311	: "%mm0", "%mm1" // clobber list
				2312	#endif
				2313	);
				2314	}
				2315
				2316	sptr -= (width_mmx*4 - 4); // sign fixed
				2317	dp -= (width_mmx*32 - 4); // sign fixed
				2318	for (i = width; i; i--)
				2319	{
				2320	png_byte v[8];
				2321	int j;
				2322	sptr -= 4;
				2323	png_memcpy(v, sptr, 4);
				2324	for (j = 0; j < png_pass_inc[pass]; j++)
				2325	{
				2326	dp -= 4;
				2327	png_memcpy(dp, v, 4);
				2328	}
				2329	}
				2330	}
				2331	else if (((pass == 2) \|\| (pass == 3)) && width)
				2332	{
				2333	int width_mmx = ((width >> 1) << 1);
				2334	width -= width_mmx; // 0,1 pixels => 0,4 bytes
				2335	if (width_mmx)
				2336	{
				2337	int dummy_value_c; // fix 'forbidden register spilled'
				2338	int dummy_value_S;
				2339	int dummy_value_D;
				2340
				2341	__asm__ __volatile__ (
				2342	"subl $4, %%esi \n\t"
				2343	"subl $28, %%edi \n\t"
				2344
				2345	".loop4_pass2: \n\t"
				2346	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
				2347	"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
				2348	"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
				2349	"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
				2350	"movq %%mm0, (%%edi) \n\t"
				2351	"movq %%mm0, 8(%%edi) \n\t"
				2352	"movq %%mm1, 16(%%edi) \n\t"
				2353	"movq %%mm1, 24(%%edi) \n\t"
				2354	"subl $8, %%esi \n\t"
				2355	"subl $32, %%edi \n\t"
				2356	"subl $2, %%ecx \n\t"
				2357	"jnz .loop4_pass2 \n\t"
				2358	"EMMS \n\t" // DONE
				2359
				2360	: "=c" (dummy_value_c), // output regs (dummy)
				2361	"=S" (dummy_value_S),
				2362	"=D" (dummy_value_D)
				2363
				2364	: "1" (sptr), // esi // input regs
				2365	"2" (dp), // edi
				2366	"0" (width_mmx) // ecx
				2367
				2368	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2369	: "%mm0", "%mm1" // clobber list
				2370	#endif
				2371	);
				2372	}
				2373
				2374	sptr -= (width_mmx*4 - 4); // sign fixed
				2375	dp -= (width_mmx*16 - 4); // sign fixed
				2376	for (i = width; i; i--)
				2377	{
				2378	png_byte v[8];
				2379	int j;
				2380	sptr -= 4;
				2381	png_memcpy(v, sptr, 4);
				2382	for (j = 0; j < png_pass_inc[pass]; j++)
				2383	{
				2384	dp -= 4;
				2385	png_memcpy(dp, v, 4);
				2386	}
				2387	}
				2388	}
				2389	else if (width) // pass == 4 or 5
				2390	{
				2391	int width_mmx = ((width >> 1) << 1) ;
				2392	width -= width_mmx; // 0,1 pixels => 0,4 bytes
				2393	if (width_mmx)
				2394	{
				2395	int dummy_value_c; // fix 'forbidden register spilled'
				2396	int dummy_value_S;
				2397	int dummy_value_D;
				2398
				2399	__asm__ __volatile__ (
				2400	"subl $4, %%esi \n\t"
				2401	"subl $12, %%edi \n\t"
				2402
				2403	".loop4_pass4: \n\t"
				2404	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
				2405	"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
				2406	"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
				2407	"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
				2408	"movq %%mm0, (%%edi) \n\t"
				2409	"subl $8, %%esi \n\t"
				2410	"movq %%mm1, 8(%%edi) \n\t"
				2411	"subl $16, %%edi \n\t"
				2412	"subl $2, %%ecx \n\t"
				2413	"jnz .loop4_pass4 \n\t"
				2414	"EMMS \n\t" // DONE
				2415
				2416	: "=c" (dummy_value_c), // output regs (dummy)
				2417	"=S" (dummy_value_S),
				2418	"=D" (dummy_value_D)
				2419
				2420	: "1" (sptr), // esi // input regs
				2421	"2" (dp), // edi
				2422	"0" (width_mmx) // ecx
				2423
				2424	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2425	: "%mm0", "%mm1" // clobber list
				2426	#endif
				2427	);
				2428	}
				2429
				2430	sptr -= (width_mmx*4 - 4); // sign fixed
				2431	dp -= (width_mmx*8 - 4); // sign fixed
				2432	for (i = width; i; i--)
				2433	{
				2434	png_byte v[8];
				2435	int j;
				2436	sptr -= 4;
				2437	png_memcpy(v, sptr, 4);
				2438	for (j = 0; j < png_pass_inc[pass]; j++)
				2439	{
				2440	dp -= 4;
				2441	png_memcpy(dp, v, 4);
				2442	}
				2443	}
				2444	}
				2445	} /* end of pixel_bytes == 4 */
				2446
				2447	//--------------------------------------------------------------
				2448	else if (pixel_bytes == 8)
				2449	{
				2450	// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
				2451	// GRR NOTE: no need to combine passes here!
				2452	if (((pass == 0) \|\| (pass == 1)) && width)
				2453	{
				2454	int dummy_value_c; // fix 'forbidden register spilled'
				2455	int dummy_value_S;
				2456	int dummy_value_D;
				2457
				2458	// source is 8-byte RRGGBBAA
				2459	// dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
				2460	__asm__ __volatile__ (
				2461	"subl $56, %%edi \n\t" // start of last block
				2462
				2463	".loop8_pass0: \n\t"
				2464	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
				2465	"movq %%mm0, (%%edi) \n\t"
				2466	"movq %%mm0, 8(%%edi) \n\t"
				2467	"movq %%mm0, 16(%%edi) \n\t"
				2468	"movq %%mm0, 24(%%edi) \n\t"
				2469	"movq %%mm0, 32(%%edi) \n\t"
				2470	"movq %%mm0, 40(%%edi) \n\t"
				2471	"movq %%mm0, 48(%%edi) \n\t"
				2472	"subl $8, %%esi \n\t"
				2473	"movq %%mm0, 56(%%edi) \n\t"
				2474	"subl $64, %%edi \n\t"
				2475	"decl %%ecx \n\t"
				2476	"jnz .loop8_pass0 \n\t"
				2477	"EMMS \n\t" // DONE
				2478
				2479	: "=c" (dummy_value_c), // output regs (dummy)
				2480	"=S" (dummy_value_S),
				2481	"=D" (dummy_value_D)
				2482
				2483	: "1" (sptr), // esi // input regs
				2484	"2" (dp), // edi
				2485	"0" (width) // ecx
				2486
				2487	#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2488	: "%mm0" // clobber list
				2489	#endif
				2490	);
				2491	}
				2492	else if (((pass == 2) \|\| (pass == 3)) && width)
				2493	{
				2494	// source is 8-byte RRGGBBAA
				2495	// dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
				2496	// (recall that expansion is _in place_: sptr and dp
				2497	// both point at locations within same row buffer)
				2498	{
				2499	int dummy_value_c; // fix 'forbidden register spilled'
				2500	int dummy_value_S;
				2501	int dummy_value_D;
				2502
				2503	__asm__ __volatile__ (
				2504	"subl $24, %%edi \n\t" // start of last block
				2505
				2506	".loop8_pass2: \n\t"
				2507	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
				2508	"movq %%mm0, (%%edi) \n\t"
				2509	"movq %%mm0, 8(%%edi) \n\t"
				2510	"movq %%mm0, 16(%%edi) \n\t"
				2511	"subl $8, %%esi \n\t"
				2512	"movq %%mm0, 24(%%edi) \n\t"
				2513	"subl $32, %%edi \n\t"
				2514	"decl %%ecx \n\t"
				2515	"jnz .loop8_pass2 \n\t"
				2516	"EMMS \n\t" // DONE
				2517
				2518	: "=c" (dummy_value_c), // output regs (dummy)
				2519	"=S" (dummy_value_S),
				2520	"=D" (dummy_value_D)
				2521
				2522	: "1" (sptr), // esi // input regs
				2523	"2" (dp), // edi
				2524	"0" (width) // ecx
				2525
				2526	#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2527	: "%mm0" // clobber list
				2528	#endif
				2529	);
				2530	}
				2531	}
				2532	else if (width) // pass == 4 or 5
				2533	{
				2534	// source is 8-byte RRGGBBAA
				2535	// dest is 16-byte RRGGBBAA RRGGBBAA
				2536	{
				2537	int dummy_value_c; // fix 'forbidden register spilled'
				2538	int dummy_value_S;
				2539	int dummy_value_D;
				2540
				2541	__asm__ __volatile__ (
				2542	"subl $8, %%edi \n\t" // start of last block
				2543
				2544	".loop8_pass4: \n\t"
				2545	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
				2546	"movq %%mm0, (%%edi) \n\t"
				2547	"subl $8, %%esi \n\t"
				2548	"movq %%mm0, 8(%%edi) \n\t"
				2549	"subl $16, %%edi \n\t"
				2550	"decl %%ecx \n\t"
				2551	"jnz .loop8_pass4 \n\t"
				2552	"EMMS \n\t" // DONE
				2553
				2554	: "=c" (dummy_value_c), // output regs (dummy)
				2555	"=S" (dummy_value_S),
				2556	"=D" (dummy_value_D)
				2557
				2558	: "1" (sptr), // esi // input regs
				2559	"2" (dp), // edi
				2560	"0" (width) // ecx
				2561
				2562	#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2563	: "%mm0" // clobber list
				2564	#endif
				2565	);
				2566	}
				2567	}
				2568
				2569	} /* end of pixel_bytes == 8 */
				2570
				2571	//--------------------------------------------------------------
				2572	else if (pixel_bytes == 6)
				2573	{
				2574	for (i = width; i; i--)
				2575	{
				2576	png_byte v[8];
				2577	int j;
				2578	png_memcpy(v, sptr, 6);
				2579	for (j = 0; j < png_pass_inc[pass]; j++)
				2580	{
				2581	png_memcpy(dp, v, 6);
				2582	dp -= 6;
				2583	}
				2584	sptr -= 6;
				2585	}
				2586	} /* end of pixel_bytes == 6 */
				2587
				2588	//--------------------------------------------------------------
				2589	else
				2590	{
				2591	for (i = width; i; i--)
				2592	{
				2593	png_byte v[8];
				2594	int j;
				2595	png_memcpy(v, sptr, pixel_bytes);
				2596	for (j = 0; j < png_pass_inc[pass]; j++)
				2597	{
				2598	png_memcpy(dp, v, pixel_bytes);
				2599	dp -= pixel_bytes;
				2600	}
				2601	sptr-= pixel_bytes;
				2602	}
				2603	}
				2604	} // end of _mmx_supported ========================================
				2605
				2606	else /* MMX not supported: use modified C code - takes advantage
				2607	* of inlining of png_memcpy for a constant */
				2608	/* GRR 19991007: does it? or should pixel_bytes in each
				2609	* block be replaced with immediate value (e.g., 1)? */
				2610	/* GRR 19991017: replaced with constants in each case */
				2611	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				2612	{
				2613	if (pixel_bytes == 1)
				2614	{
				2615	for (i = width; i; i--)
				2616	{
				2617	int j;
				2618	for (j = 0; j < png_pass_inc[pass]; j++)
				2619	{
				2620	dp-- = sptr;
				2621	}
				2622	--sptr;
				2623	}
				2624	}
				2625	else if (pixel_bytes == 3)
				2626	{
				2627	for (i = width; i; i--)
				2628	{
				2629	png_byte v[8];
				2630	int j;
				2631	png_memcpy(v, sptr, 3);
				2632	for (j = 0; j < png_pass_inc[pass]; j++)
				2633	{
				2634	png_memcpy(dp, v, 3);
				2635	dp -= 3;
				2636	}
				2637	sptr -= 3;
				2638	}
				2639	}
				2640	else if (pixel_bytes == 2)
				2641	{
				2642	for (i = width; i; i--)
				2643	{
				2644	png_byte v[8];
				2645	int j;
				2646	png_memcpy(v, sptr, 2);
				2647	for (j = 0; j < png_pass_inc[pass]; j++)
				2648	{
				2649	png_memcpy(dp, v, 2);
				2650	dp -= 2;
				2651	}
				2652	sptr -= 2;
				2653	}
				2654	}
				2655	else if (pixel_bytes == 4)
				2656	{
				2657	for (i = width; i; i--)
				2658	{
				2659	png_byte v[8];
				2660	int j;
				2661	png_memcpy(v, sptr, 4);
				2662	for (j = 0; j < png_pass_inc[pass]; j++)
				2663	{
				2664	#ifdef PNG_DEBUG
				2665	if (dp < row \|\| dp+3 > row+png_ptr->row_buf_size)
				2666	{
				2667	printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
				2668	row, dp, row+png_ptr->row_buf_size);
				2669	printf("row_buf=%d\n",png_ptr->row_buf_size);
				2670	}
				2671	#endif
				2672	png_memcpy(dp, v, 4);
				2673	dp -= 4;
				2674	}
				2675	sptr -= 4;
				2676	}
				2677	}
				2678	else if (pixel_bytes == 6)
				2679	{
				2680	for (i = width; i; i--)
				2681	{
				2682	png_byte v[8];
				2683	int j;
				2684	png_memcpy(v, sptr, 6);
				2685	for (j = 0; j < png_pass_inc[pass]; j++)
				2686	{
				2687	png_memcpy(dp, v, 6);
				2688	dp -= 6;
				2689	}
				2690	sptr -= 6;
				2691	}
				2692	}
				2693	else if (pixel_bytes == 8)
				2694	{
				2695	for (i = width; i; i--)
				2696	{
				2697	png_byte v[8];
				2698	int j;
				2699	png_memcpy(v, sptr, 8);
				2700	for (j = 0; j < png_pass_inc[pass]; j++)
				2701	{
				2702	png_memcpy(dp, v, 8);
				2703	dp -= 8;
				2704	}
				2705	sptr -= 8;
				2706	}
				2707	}
				2708	else /* GRR: should never be reached */
				2709	{
				2710	for (i = width; i; i--)
				2711	{
				2712	png_byte v[8];
				2713	int j;
				2714	png_memcpy(v, sptr, pixel_bytes);
				2715	for (j = 0; j < png_pass_inc[pass]; j++)
				2716	{
				2717	png_memcpy(dp, v, pixel_bytes);
				2718	dp -= pixel_bytes;
				2719	}
				2720	sptr -= pixel_bytes;
				2721	}
				2722	}
				2723
				2724	} /* end if (MMX not supported) */
				2725	break;
				2726	}
				2727	} /* end switch (row_info->pixel_depth) */
				2728
				2729	row_info->width = final_width;
				2730	row_info->rowbytes = ((final_width *
				2731	(png_uint_32)row_info->pixel_depth + 7) >> 3);
				2732	}
				2733
				2734	} /* end png_do_read_interlace() */
				2735
				2736	#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
				2737	#endif /* PNG_READ_INTERLACING_SUPPORTED */
				2738
				2739
				2740
				2741	#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
				2742	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
				2743
				2744	// These variables are utilized in the functions below. They are declared
				2745	// globally here to ensure alignment on 8-byte boundaries.
				2746
				2747	union uAll { // 64-bit cell; the instances defined below are referenced by symbol name from the inline asm
				2748	long long use; // 64-bit integer payload: the member that is initialized here and assigned via .use by the C code
				2749	double align; // never assigned in the visible code; presumably present only to request 8-byte alignment
				2750	} _LBCarryMask = {0x0101010101010101LL}, // 0x01 in each of the 8 bytes; the Avg filter loads it into %mm5
				2751	_HBClearMask = {0x7f7f7f7f7f7f7f7fLL}, // 0x7f in each of the 8 bytes; the Avg filter loads it into %mm4
				2752	_ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem; // no initializer here; _ActiveMask/_ShiftBpp/_ShiftRem are set per-bpp through .use before the asm reads them (uses of the other two are not visible in this part of the file)
				2753
				2754	#ifdef PNG_THREAD_UNSAFE_OK
				2755	//===========================================================================//
				2756	// //
				2757	// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
				2758	// //
				2759	//===========================================================================//
				2760
				2761	// Optimized code for PNG Average filter decoder
				2762
				2763	static void /* PRIVATE */
				2764	png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
				2765	png_bytep prev_row)
				2766	{
				2767	int bpp;
				2768	int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
				2769	int dummy_value_S;
				2770	int dummy_value_D;
				2771
				2772	bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
				2773	_FullLength = row_info->rowbytes; // # of bytes to filter
				2774
				2775	__asm__ __volatile__ (
				2776	// initialize address pointers and offset
				2777	#ifdef __PIC__
				2778	"pushl %%ebx \n\t" // save index to Global Offset Table
				2779	#endif
				2780	//pre "movl row, %%edi \n\t" // edi: Avg(x)
				2781	"xorl %%ebx, %%ebx \n\t" // ebx: x
				2782	"movl %%edi, %%edx \n\t"
				2783	//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
				2784	//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
				2785	"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
				2786
				2787	"xorl %%eax,%%eax \n\t"
				2788
				2789	// Compute the Raw value for the first bpp bytes
				2790	// Raw(x) = Avg(x) + (Prior(x)/2)
				2791	"avg_rlp: \n\t"
				2792	"movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
				2793	"incl %%ebx \n\t"
				2794	"shrb %%al \n\t" // divide by 2
				2795	"addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
				2796	//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
				2797	"cmpl %%ecx, %%ebx \n\t"
				2798	"movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
				2799	"jb avg_rlp \n\t" // mov does not affect flags
				2800
				2801	// get # of bytes to alignment
				2802	"movl %%edi, _dif \n\t" // take start of row
				2803	"addl %%ebx, _dif \n\t" // add bpp
				2804	"addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
				2805	"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
				2806	"subl %%edi, _dif \n\t" // subtract from start => value ebx at
				2807	"jz avg_go \n\t" // alignment
				2808
				2809	// fix alignment
				2810	// Compute the Raw value for the bytes up to the alignment boundary
				2811	// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
				2812	"xorl %%ecx, %%ecx \n\t"
				2813
				2814	"avg_lp1: \n\t"
				2815	"xorl %%eax, %%eax \n\t"
				2816	"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
				2817	"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
				2818	"addw %%cx, %%ax \n\t"
				2819	"incl %%ebx \n\t"
				2820	"shrw %%ax \n\t" // divide by 2
				2821	"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
				2822	"cmpl _dif, %%ebx \n\t" // check if at alignment boundary
				2823	"movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
				2824	"jb avg_lp1 \n\t" // repeat until at alignment boundary
				2825
				2826	"avg_go: \n\t"
				2827	"movl _FullLength, %%eax \n\t"
				2828	"movl %%eax, %%ecx \n\t"
				2829	"subl %%ebx, %%eax \n\t" // subtract alignment fix
				2830	"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
				2831	"subl %%eax, %%ecx \n\t" // drop over bytes from original length
				2832	"movl %%ecx, _MMXLength \n\t"
				2833	#ifdef __PIC__
				2834	"popl %%ebx \n\t" // restore index to Global Offset Table
				2835	#endif
				2836
				2837	: "=c" (dummy_value_c), // output regs (dummy)
				2838	"=S" (dummy_value_S),
				2839	"=D" (dummy_value_D)
				2840
				2841	: "0" (bpp), // ecx // input regs
				2842	"1" (prev_row), // esi
				2843	"2" (row) // edi
				2844
				2845	: "%eax", "%edx" // clobber list
				2846	#ifndef __PIC__
				2847	, "%ebx"
				2848	#endif
				2849	// GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
				2850	// (seems to work fine without...)
				2851	);
				2852
				2853	// now do the math for the rest of the row
				2854	switch (bpp)
				2855	{
				2856	case 3:
				2857	{
				2858	_ActiveMask.use = 0x0000000000ffffffLL;
				2859	_ShiftBpp.use = 24; // == 3 * 8
				2860	_ShiftRem.use = 40; // == 64 - 24
				2861
				2862	__asm__ __volatile__ (
				2863	// re-init address pointers and offset
				2864	"movq _ActiveMask, %%mm7 \n\t"
				2865	"movl _dif, %%ecx \n\t" // ecx: x = offset to
				2866	"movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
				2867	// preload "movl row, %%edi \n\t" // edi: Avg(x)
				2868	"movq _HBClearMask, %%mm4 \n\t"
				2869	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
				2870
				2871	// prime the pump: load the first Raw(x-bpp) data set
				2872	"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
				2873	// (correct pos. in loop below)
				2874	"avg_3lp: \n\t"
				2875	"movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
				2876	"movq %%mm5, %%mm3 \n\t"
				2877	"psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
				2878	// data
				2879	"movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
				2880	"movq %%mm7, %%mm6 \n\t"
				2881	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
				2882	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
				2883	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
				2884	// byte
				2885	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
				2886	// each byte
				2887	// add 1st active group (Raw(x-bpp)/2) to average with LBCarry
				2888	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
				2889	// LBCarrys
				2890	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
				2891	// where both
				2892	// lsb's were == 1 (only valid for active group)
				2893	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				2894	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				2895	// byte
				2896	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
				2897	// for each byte
				2898	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
				2899	// bytes to add to Avg
				2900	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
				2901	// Avg for each Active
				2902	// byte
				2903	// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
				2904	"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
				2905	// bytes 3-5
				2906	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
				2907	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
				2908	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
				2909	// LBCarrys
				2910	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
				2911	// where both
				2912	// lsb's were == 1 (only valid for active group)
				2913	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				2914	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				2915	// byte
				2916	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
				2917	// for each byte
				2918	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
				2919	// bytes to add to Avg
				2920	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
				2921	// Avg for each Active
				2922	// byte
				2923
				2924	// add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
				2925	"psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
				2926	// two
				2927	// bytes
				2928	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
				2929	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
				2930	// Data only needs to be shifted once here to
				2931	// get the correct x-bpp offset.
				2932	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
				2933	// LBCarrys
				2934	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
				2935	// where both
				2936	// lsb's were == 1 (only valid for active group)
				2937	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				2938	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				2939	// byte
				2940	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
				2941	// for each byte
				2942	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
				2943	// bytes to add to Avg
				2944	"addl $8, %%ecx \n\t"
				2945	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
				2946	// Avg for each Active
				2947	// byte
				2948	// now ready to write back to memory
				2949	"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
				2950	// move updated Raw(x) to use as Raw(x-bpp) for next loop
				2951	"cmpl _MMXLength, %%ecx \n\t"
				2952	"movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
				2953	"jb avg_3lp \n\t"
				2954
				2955	: "=S" (dummy_value_S), // output regs (dummy)
				2956	"=D" (dummy_value_D)
				2957
				2958	: "0" (prev_row), // esi // input regs
				2959	"1" (row) // edi
				2960
				2961	: "%ecx" // clobber list
				2962	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
				2963	, "%mm0", "%mm1", "%mm2", "%mm3"
				2964	, "%mm4", "%mm5", "%mm6", "%mm7"
				2965	#endif
				2966	);
				2967	}
				2968	break; // end 3 bpp
				2969
				2970	case 6:
				2971	case 4:
				2972	//case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
				2973	//case 5: // GRR BOGUS
				2974	{
				2975	_ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
				2976	// appropriate inactive bytes
				2977	_ShiftBpp.use = bpp << 3;
				2978	_ShiftRem.use = 64 - _ShiftBpp.use;
				2979
				2980	__asm__ __volatile__ (
				2981	"movq _HBClearMask, %%mm4 \n\t"
				2982
				2983	// re-init address pointers and offset
				2984	"movl _dif, %%ecx \n\t" // ecx: x = offset to
				2985	// alignment boundary
				2986
				2987	// load _ActiveMask and clear all bytes except for 1st active group
				2988	"movq _ActiveMask, %%mm7 \n\t"
				2989	// preload "movl row, %%edi \n\t" // edi: Avg(x)
				2990	"psrlq _ShiftRem, %%mm7 \n\t"
				2991	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
				2992	"movq %%mm7, %%mm6 \n\t"
				2993	"movq _LBCarryMask, %%mm5 \n\t"
				2994	"psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
				2995	// group
				2996
				2997	// prime the pump: load the first Raw(x-bpp) data set
				2998	"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
				2999	// (we correct pos. in loop below)
				3000	"avg_4lp: \n\t"
				3001	"movq (%%edi,%%ecx,), %%mm0 \n\t"
				3002	"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
				3003	"movq (%%esi,%%ecx,), %%mm1 \n\t"
				3004	// add (Prev_row/2) to average
				3005	"movq %%mm5, %%mm3 \n\t"
				3006	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
				3007	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
				3008	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
				3009	// byte
				3010	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
				3011	// each byte
				3012	// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
				3013	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
				3014	// LBCarrys
				3015	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
				3016	// where both
				3017	// lsb's were == 1 (only valid for active group)
				3018	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				3019	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				3020	// byte
				3021	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
				3022	// for each byte
				3023	"pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
				3024	// bytes to add to Avg
				3025	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
				3026	// for each Active
				3027	// byte
				3028	// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
				3029	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
				3030	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
				3031	"addl $8, %%ecx \n\t"
				3032	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
				3033	// LBCarrys
				3034	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
				3035	// where both
				3036	// lsb's were == 1 (only valid for active group)
				3037	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				3038	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				3039	// byte
				3040	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
				3041	// for each byte
				3042	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
				3043	// bytes to add to Avg
				3044	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
				3045	// Avg for each Active
				3046	// byte
				3047	"cmpl _MMXLength, %%ecx \n\t"
				3048	// now ready to write back to memory
				3049	"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
				3050	// prep Raw(x-bpp) for next loop
				3051	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
				3052	"jb avg_4lp \n\t"
				3053
				3054	: "=S" (dummy_value_S), // output regs (dummy)
				3055	"=D" (dummy_value_D)
				3056
				3057	: "0" (prev_row), // esi // input regs
				3058	"1" (row) // edi
				3059
				3060	: "%ecx" // clobber list
				3061	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
				3062	, "%mm0", "%mm1", "%mm2", "%mm3"
				3063	, "%mm4", "%mm5", "%mm6", "%mm7"
				3064	#endif
				3065	);
				3066	}
				3067	break; // end 4,6 bpp
				3068
				3069	case 2:
				3070	{
				3071	_ActiveMask.use = 0x000000000000ffffLL;
				3072	_ShiftBpp.use = 16; // == 2 * 8
				3073	_ShiftRem.use = 48; // == 64 - 16
				3074
				3075	__asm__ __volatile__ (
				3076	// load _ActiveMask
				3077	"movq _ActiveMask, %%mm7 \n\t"
				3078	// re-init address pointers and offset
				3079	"movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
				3080	// boundary
				3081	"movq _LBCarryMask, %%mm5 \n\t"
				3082	// preload "movl row, %%edi \n\t" // edi: Avg(x)
				3083	"movq _HBClearMask, %%mm4 \n\t"
				3084	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
				3085
				3086	// prime the pump: load the first Raw(x-bpp) data set
				3087	"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
				3088	// (we correct pos. in loop below)
				3089	"avg_2lp: \n\t"
				3090	"movq (%%edi,%%ecx,), %%mm0 \n\t"
				3091	"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
				3092	"movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
				3093	// add (Prev_row/2) to average
				3094	"movq %%mm5, %%mm3 \n\t"
				3095	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
				3096	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
				3097	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
				3098	// byte
				3099	"movq %%mm7, %%mm6 \n\t"
				3100	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
				3101	// each byte
				3102
				3103	// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
				3104	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
				3105	// LBCarrys
				3106	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
				3107	// where both
				3108	// lsb's were == 1 (only valid
				3109	// for active group)
				3110	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				3111	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				3112	// byte
				3113	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
				3114	// for each byte
				3115	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
				3116	// bytes to add to Avg
				3117	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
				3118	// for each Active byte
				3119
				3120	// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
				3121	"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
				3122	// bytes 2 & 3
				3123	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
				3124	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
				3125	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
				3126	// LBCarrys
				3127	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
				3128	// where both
				3129	// lsb's were == 1 (only valid
				3130	// for active group)
				3131	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				3132	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				3133	// byte
				3134	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
				3135	// for each byte
				3136	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
				3137	// bytes to add to Avg
				3138	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
				3139	// Avg for each Active byte
				3140
				3141	// add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
				3142	"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
				3143	// bytes 4 & 5
				3144	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
				3145	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
				3146	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
				3147	// LBCarrys
				3148	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
				3149	// where both lsb's were == 1
				3150	// (only valid for active group)
				3151	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				3152	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				3153	// byte
				3154	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
				3155	// for each byte
				3156	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
				3157	// bytes to add to Avg
				3158	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
				3159	// Avg for each Active byte
				3160
				3161	// add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
				3162	"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
				3163	// bytes 6 & 7
				3164	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
				3165	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
				3166	"addl $8, %%ecx \n\t"
				3167	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
				3168	// LBCarrys
				3169	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
				3170	// where both
				3171	// lsb's were == 1 (only valid
				3172	// for active group)
				3173	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				3174	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				3175	// byte
				3176	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
				3177	// for each byte
				3178	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
				3179	// bytes to add to Avg
				3180	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
				3181	// Avg for each Active byte
				3182
				3183	"cmpl _MMXLength, %%ecx \n\t"
				3184	// now ready to write back to memory
				3185	"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
				3186	// prep Raw(x-bpp) for next loop
				3187	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
				3188	"jb avg_2lp \n\t"
				3189
				3190	: "=S" (dummy_value_S), // output regs (dummy)
				3191	"=D" (dummy_value_D)
				3192
				3193	: "0" (prev_row), // esi // input regs
				3194	"1" (row) // edi
				3195
				3196	: "%ecx" // clobber list
				3197	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
				3198	, "%mm0", "%mm1", "%mm2", "%mm3"
				3199	, "%mm4", "%mm5", "%mm6", "%mm7"
				3200	#endif
				3201	);
				3202	}
				3203	break; // end 2 bpp
				3204
				3205	case 1:
				3206	{
				3207	__asm__ __volatile__ (
				3208	// re-init address pointers and offset
				3209	#ifdef __PIC__
				3210	"pushl %%ebx \n\t" // save Global Offset Table index
				3211	#endif
				3212	"movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
				3213	// boundary
				3214	// preload "movl row, %%edi \n\t" // edi: Avg(x)
				3215	"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
				3216	"jnb avg_1end \n\t"
				3217	// do Avg decode for remaining bytes
				3218	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
				3219	"movl %%edi, %%edx \n\t"
				3220	// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
				3221	"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
				3222	"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
				3223	// in loop below
				3224	"avg_1lp: \n\t"
				3225	// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
				3226	"xorl %%eax, %%eax \n\t"
				3227	"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
				3228	"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
				3229	"addw %%cx, %%ax \n\t"
				3230	"incl %%ebx \n\t"
				3231	"shrw %%ax \n\t" // divide by 2
				3232	"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
				3233	// inc ebx
				3234	"cmpl _FullLength, %%ebx \n\t" // check if at end of array
				3235	"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
				3236	// mov does not affect flags; -1 to offset inc ebx
				3237	"jb avg_1lp \n\t"
				3238
				3239	"avg_1end: \n\t"
				3240	#ifdef __PIC__
				3241	"popl %%ebx \n\t" // Global Offset Table index
				3242	#endif
				3243
				3244	: "=c" (dummy_value_c), // output regs (dummy)
				3245	"=S" (dummy_value_S),
				3246	"=D" (dummy_value_D)
				3247
				3248	: "0" (bpp), // ecx // input regs
				3249	"1" (prev_row), // esi
				3250	"2" (row) // edi
				3251
				3252	: "%eax", "%edx" // clobber list
				3253	#ifndef __PIC__
				3254	, "%ebx"
				3255	#endif
				3256	);
				3257	}
				3258	return; // end 1 bpp
				3259
				3260	case 8:
				3261	{
				3262	__asm__ __volatile__ (
				3263	// re-init address pointers and offset
				3264	"movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
				3265	"movq _LBCarryMask, %%mm5 \n\t" // boundary
				3266	// preload "movl row, %%edi \n\t" // edi: Avg(x)
				3267	"movq _HBClearMask, %%mm4 \n\t"
				3268	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
				3269
				3270	// prime the pump: load the first Raw(x-bpp) data set
				3271	"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
				3272	// (NO NEED to correct pos. in loop below)
				3273
				3274	"avg_8lp: \n\t"
				3275	"movq (%%edi,%%ecx,), %%mm0 \n\t"
				3276	"movq %%mm5, %%mm3 \n\t"
				3277	"movq (%%esi,%%ecx,), %%mm1 \n\t"
				3278	"addl $8, %%ecx \n\t"
				3279	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
				3280	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
				3281	"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
				3282	// where both lsb's were == 1
				3283	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				3284	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
				3285	"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
				3286	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
				3287	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
				3288	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
				3289	"cmpl _MMXLength, %%ecx \n\t"
				3290	"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
				3291	"movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
				3292	"jb avg_8lp \n\t"
				3293
				3294	: "=S" (dummy_value_S), // output regs (dummy)
				3295	"=D" (dummy_value_D)
				3296
				3297	: "0" (prev_row), // esi // input regs
				3298	"1" (row) // edi
				3299
				3300	: "%ecx" // clobber list
				3301	#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
				3302	, "%mm0", "%mm1", "%mm2"
				3303	, "%mm3", "%mm4", "%mm5"
				3304	#endif
				3305	);
				3306	}
				3307	break; // end 8 bpp
				3308
				3309	default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
				3310	{
				3311
				3312	#ifdef PNG_DEBUG
				3313	// GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
				3314	png_debug(1,
				3315	"Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
				3316	#endif
				3317
				3318	#if 0
				3319	__asm__ __volatile__ (
				3320	"movq _LBCarryMask, %%mm5 \n\t"
				3321	// re-init address pointers and offset
				3322	"movl _dif, %%ebx \n\t" // ebx: x = offset to
				3323	// alignment boundary
				3324	"movl row, %%edi \n\t" // edi: Avg(x)
				3325	"movq _HBClearMask, %%mm4 \n\t"
				3326	"movl %%edi, %%edx \n\t"
				3327	"movl prev_row, %%esi \n\t" // esi: Prior(x)
				3328	"subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
				3329	"avg_Alp: \n\t"
				3330	"movq (%%edi,%%ebx,), %%mm0 \n\t"
				3331	"movq %%mm5, %%mm3 \n\t"
				3332	"movq (%%esi,%%ebx,), %%mm1 \n\t"
				3333	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
				3334	"movq (%%edx,%%ebx,), %%mm2 \n\t"
				3335	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
				3336	"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
				3337	// where both lsb's were == 1
				3338	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
				3339	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
				3340	// byte
				3341	"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
				3342	// byte
				3343	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
				3344	// byte
				3345	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
				3346	// each byte
				3347	"addl $8, %%ebx \n\t"
				3348	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
				3349	// byte
				3350	"cmpl _MMXLength, %%ebx \n\t"
				3351	"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
				3352	"jb avg_Alp \n\t"
				3353
				3354	: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
				3355
				3356	: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
				3357
				3358	: "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
				3359	);
				3360	#endif /* 0 - NEVER REACHED */
				3361	}
				3362	break;
				3363
				3364	} // end switch (bpp)
				3365
				3366	__asm__ __volatile__ (
				3367	// MMX acceleration complete; now do clean-up
				3368	// check if any remaining bytes left to decode
				3369	#ifdef __PIC__
				3370	"pushl %%ebx \n\t" // save index to Global Offset Table
				3371	#endif
				3372	"movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
				3373	//pre "movl row, %%edi \n\t" // edi: Avg(x)
				3374	"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
				3375	"jnb avg_end \n\t"
				3376
				3377	// do Avg decode for remaining bytes
				3378	//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
				3379	"movl %%edi, %%edx \n\t"
				3380	//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
				3381	"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
				3382	"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
				3383
				3384	"avg_lp2: \n\t"
				3385	// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
				3386	"xorl %%eax, %%eax \n\t"
				3387	"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
				3388	"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
				3389	"addw %%cx, %%ax \n\t"
				3390	"incl %%ebx \n\t"
				3391	"shrw %%ax \n\t" // divide by 2
				3392	"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
				3393	"cmpl _FullLength, %%ebx \n\t" // check if at end of array
				3394	"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
				3395	"jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
				3396
				3397	"avg_end: \n\t"
				3398	"EMMS \n\t" // end MMX; prep for poss. FP instrs.
				3399	#ifdef __PIC__
				3400	"popl %%ebx \n\t" // restore index to Global Offset Table
				3401	#endif
				3402
				3403	: "=c" (dummy_value_c), // output regs (dummy)
				3404	"=S" (dummy_value_S),
				3405	"=D" (dummy_value_D)
				3406
				3407	: "0" (bpp), // ecx // input regs
				3408	"1" (prev_row), // esi
				3409	"2" (row) // edi
				3410
				3411	: "%eax", "%edx" // clobber list
				3412	#ifndef __PIC__
				3413	, "%ebx"
				3414	#endif
				3415	);
				3416
				3417	} /* end png_read_filter_row_mmx_avg() */
				3418	#endif
				3419
				3420
				3421
				3422	#ifdef PNG_THREAD_UNSAFE_OK
				3423	//===========================================================================//
				3424	// //
				3425	// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
				3426	// //
				3427	//===========================================================================//
				3428
				3429	// Optimized code for PNG Paeth filter decoder
				3430
				3431	static void /* PRIVATE */
				3432	png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
				3433	png_bytep prev_row)
				3434	{
				3435	int bpp;
				3436	int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
				3437	int dummy_value_S;
				3438	int dummy_value_D;
				3439
				3440	bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
				3441	_FullLength = row_info->rowbytes; // # of bytes to filter
				3442
				3443	__asm__ __volatile__ (
				3444	#ifdef __PIC__
				3445	"pushl %%ebx \n\t" // save index to Global Offset Table
				3446	#endif
				3447	"xorl %%ebx, %%ebx \n\t" // ebx: x offset
				3448	//pre "movl row, %%edi \n\t"
				3449	"xorl %%edx, %%edx \n\t" // edx: x-bpp offset
				3450	//pre "movl prev_row, %%esi \n\t"
				3451	"xorl %%eax, %%eax \n\t"
				3452
				3453	// Compute the Raw value for the first bpp bytes
				3454	// Note: the formula works out to be always
				3455	// Raw(x) = Paeth(x) + Prior(x) where x < bpp
				3456	"paeth_rlp: \n\t"
				3457	"movb (%%edi,%%ebx,), %%al \n\t"
				3458	"addb (%%esi,%%ebx,), %%al \n\t"
				3459	"incl %%ebx \n\t"
				3460	//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
				3461	"cmpl %%ecx, %%ebx \n\t"
				3462	"movb %%al, -1(%%edi,%%ebx,) \n\t"
				3463	"jb paeth_rlp \n\t"
				3464	// get # of bytes to alignment
				3465	"movl %%edi, _dif \n\t" // take start of row
				3466	"addl %%ebx, _dif \n\t" // add bpp
				3467	"xorl %%ecx, %%ecx \n\t"
				3468	"addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
				3469	// boundary
				3470	"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
				3471	"subl %%edi, _dif \n\t" // subtract from start ==> value ebx
				3472	// at alignment
				3473	"jz paeth_go \n\t"
				3474	// fix alignment
				3475
				3476	"paeth_lp1: \n\t"
				3477	"xorl %%eax, %%eax \n\t"
				3478	// pav = p - a = (a + b - c) - a = b - c
				3479	"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
				3480	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
				3481	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
				3482	"movl %%eax, _patemp \n\t" // Save pav for later use
				3483	"xorl %%eax, %%eax \n\t"
				3484	// pbv = p - b = (a + b - c) - b = a - c
				3485	"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
				3486	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
				3487	"movl %%eax, %%ecx \n\t"
				3488	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				3489	"addl _patemp, %%eax \n\t" // pcv = pav + pbv
				3490	// pc = abs(pcv)
				3491	"testl $0x80000000, %%eax \n\t"
				3492	"jz paeth_pca \n\t"
				3493	"negl %%eax \n\t" // reverse sign of neg values
				3494
				3495	"paeth_pca: \n\t"
				3496	"movl %%eax, _pctemp \n\t" // save pc for later use
				3497	// pb = abs(pbv)
				3498	"testl $0x80000000, %%ecx \n\t"
				3499	"jz paeth_pba \n\t"
				3500	"negl %%ecx \n\t" // reverse sign of neg values
				3501
				3502	"paeth_pba: \n\t"
				3503	"movl %%ecx, _pbtemp \n\t" // save pb for later use
				3504	// pa = abs(pav)
				3505	"movl _patemp, %%eax \n\t"
				3506	"testl $0x80000000, %%eax \n\t"
				3507	"jz paeth_paa \n\t"
				3508	"negl %%eax \n\t" // reverse sign of neg values
				3509
				3510	"paeth_paa: \n\t"
				3511	"movl %%eax, _patemp \n\t" // save pa for later use
				3512	// test if pa <= pb
				3513	"cmpl %%ecx, %%eax \n\t"
				3514	"jna paeth_abb \n\t"
				3515	// pa > pb; now test if pb <= pc
				3516	"cmpl _pctemp, %%ecx \n\t"
				3517	"jna paeth_bbc \n\t"
				3518	// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
				3519	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
				3520	"jmp paeth_paeth \n\t"
				3521
				3522	"paeth_bbc: \n\t"
				3523	// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
				3524	"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
				3525	"jmp paeth_paeth \n\t"
				3526
				3527	"paeth_abb: \n\t"
				3528	// pa <= pb; now test if pa <= pc
				3529	"cmpl _pctemp, %%eax \n\t"
				3530	"jna paeth_abc \n\t"
				3531	// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
				3532	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
				3533	"jmp paeth_paeth \n\t"
				3534
				3535	"paeth_abc: \n\t"
				3536	// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
				3537	"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
				3538
				3539	"paeth_paeth: \n\t"
				3540	"incl %%ebx \n\t"
				3541	"incl %%edx \n\t"
				3542	// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
				3543	"addb %%cl, -1(%%edi,%%ebx,) \n\t"
				3544	"cmpl _dif, %%ebx \n\t"
				3545	"jb paeth_lp1 \n\t"
				3546
				3547	"paeth_go: \n\t"
				3548	"movl _FullLength, %%ecx \n\t"
				3549	"movl %%ecx, %%eax \n\t"
				3550	"subl %%ebx, %%eax \n\t" // subtract alignment fix
				3551	"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
				3552	"subl %%eax, %%ecx \n\t" // drop over bytes from original length
				3553	"movl %%ecx, _MMXLength \n\t"
				3554	#ifdef __PIC__
				3555	"popl %%ebx \n\t" // restore index to Global Offset Table
				3556	#endif
				3557
				3558	: "=c" (dummy_value_c), // output regs (dummy)
				3559	"=S" (dummy_value_S),
				3560	"=D" (dummy_value_D)
				3561
				3562	: "0" (bpp), // ecx // input regs
				3563	"1" (prev_row), // esi
				3564	"2" (row) // edi
				3565
				3566	: "%eax", "%edx" // clobber list
				3567	#ifndef __PIC__
				3568	, "%ebx"
				3569	#endif
				3570	);
				3571
				3572	// now do the math for the rest of the row
				3573	switch (bpp)
				3574	{
				3575	case 3: // 3 bytes per pixel: each 8-byte group is decoded in three passes (bytes 0-2, 3-5, 6-7)
				3576	{
				3577	_ActiveMask.use = 0x0000000000ffffffLL; // selects bytes 0-2 of a quadword
				3578	_ActiveMaskEnd.use = 0xffff000000000000LL; // selects bytes 6-7 of a quadword
				3579	_ShiftBpp.use = 24; // == bpp(3) * 8
				3580	_ShiftRem.use = 40; // == 64 - 24
				3581
				3582	__asm__ __volatile__ ( // MMX loop: ecx steps from _dif up to _MMXLength, 8 bytes per iteration
				3583	"movl _dif, %%ecx \n\t"
				3584	// preload "movl row, %%edi \n\t"
				3585	// preload "movl prev_row, %%esi \n\t"
				3586	"pxor %%mm0, %%mm0 \n\t" // mm0 = 0: zero source for the byte->word unpacks
				3587	// prime the pump: load the first Raw(x-bpp) data set
				3588	"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
				3589	"paeth_3lp: \n\t"
				3590	"psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
				3591	// 3 bytes
				3592	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
				3593	"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a (zero-extended to words)
				3594	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
				3595	"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
				3596	"psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
				3597	// 3 bytes
				3598	// pav = p - a = (a + b - c) - a = b - c
				3599	"movq %%mm2, %%mm4 \n\t"
				3600	"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
				3601	// pbv = p - b = (a + b - c) - b = a - c
				3602	"movq %%mm1, %%mm5 \n\t"
				3603	"psubw %%mm3, %%mm4 \n\t" // mm4 = pav = b - c
				3604	"pxor %%mm7, %%mm7 \n\t"
				3605	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				3606	"movq %%mm4, %%mm6 \n\t"
				3607	"psubw %%mm3, %%mm5 \n\t" // mm5 = pbv = a - c
				3608
				3609	// pa = abs(p-a) = abs(pav)
				3610	// pb = abs(p-b) = abs(pbv)
				3611	// pc = abs(p-c) = abs(pcv)
				3612	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav words < 0
				3613	"paddw %%mm5, %%mm6 \n\t" // mm6 = pcv = pav + pbv
				3614	"pand %%mm4, %%mm0 \n\t" // only pav words < 0 in mm0
				3615	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv words < 0
				3616	"psubw %%mm0, %%mm4 \n\t"
				3617	"pand %%mm5, %%mm7 \n\t" // only pbv words < 0 in mm7
				3618	"psubw %%mm0, %%mm4 \n\t" // negatives subtracted twice: mm4 = pa = abs(pav)
				3619	"psubw %%mm7, %%mm5 \n\t"
				3620	"pxor %%mm0, %%mm0 \n\t"
				3621	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv words < 0
				3622	"pand %%mm6, %%mm0 \n\t" // only pcv words < 0 in mm0
				3623	"psubw %%mm7, %%mm5 \n\t" // negatives subtracted twice: mm5 = pb = abs(pbv)
				3624	"psubw %%mm0, %%mm6 \n\t"
				3625	// test pa <= pb
				3626	"movq %%mm4, %%mm7 \n\t"
				3627	"psubw %%mm0, %%mm6 \n\t" // negatives subtracted twice: mm6 = pc = abs(pcv)
				3628	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
				3629	"movq %%mm7, %%mm0 \n\t"
				3630	// use mm7 mask to merge pa & pb
				3631	"pand %%mm7, %%mm5 \n\t" // pb where pa > pb
				3632	// use mm0 mask copy to merge a & b
				3633	"pand %%mm0, %%mm2 \n\t" // b where pa > pb
				3634	"pandn %%mm4, %%mm7 \n\t" // pa where pa <= pb
				3635	"pandn %%mm1, %%mm0 \n\t" // a where pa <= pb
				3636	"paddw %%mm5, %%mm7 \n\t" // mm7 = (pa <= pb) ? pa : pb
				3637	"paddw %%mm2, %%mm0 \n\t" // mm0 = (pa <= pb) ? a : b
				3638	// test ((pa <= pb)? pa:pb) <= pc
				3639	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
				3640	"pxor %%mm1, %%mm1 \n\t"
				3641	"pand %%mm7, %%mm3 \n\t" // c where pab > pc
				3642	"pandn %%mm0, %%mm7 \n\t" // a or b where pab <= pc
				3643	"paddw %%mm3, %%mm7 \n\t" // mm7 = selected predictor (as words)
				3644	"pxor %%mm0, %%mm0 \n\t"
				3645	"packuswb %%mm1, %%mm7 \n\t" // pack predictor words into the low 4 bytes of mm7
				3646	"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
				3647	"pand _ActiveMask, %%mm7 \n\t" // keep only bytes 0-2
				3648	"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
				3649	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
				3650	"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
				3651	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
				3652	"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
				3653	// Raw(x-bpp)
				3654	// now do Paeth for 2nd set of bytes (3-5)
				3655	"psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
				3656	"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
				3657	"pxor %%mm7, %%mm7 \n\t"
				3658	"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
				3659	// pbv = p - b = (a + b - c) - b = a - c
				3660	"movq %%mm1, %%mm5 \n\t"
				3661	// pav = p - a = (a + b - c) - a = b - c
				3662	"movq %%mm2, %%mm4 \n\t"
				3663	"psubw %%mm3, %%mm5 \n\t" // mm5 = pbv = a - c
				3664	"psubw %%mm3, %%mm4 \n\t" // mm4 = pav = b - c
				3665	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
				3666	// pav + pbv = pbv + pav
				3667	"movq %%mm5, %%mm6 \n\t"
				3668	"paddw %%mm4, %%mm6 \n\t" // mm6 = pcv
				3669
				3670	// pa = abs(p-a) = abs(pav)
				3671	// pb = abs(p-b) = abs(pbv)
				3672	// pc = abs(p-c) = abs(pcv)
				3673	"pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv words < 0
				3674	"pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav words < 0
				3675	"pand %%mm5, %%mm0 \n\t" // only pbv words < 0 in mm0
				3676	"pand %%mm4, %%mm7 \n\t" // only pav words < 0 in mm7
				3677	"psubw %%mm0, %%mm5 \n\t"
				3678	"psubw %%mm7, %%mm4 \n\t"
				3679	"psubw %%mm0, %%mm5 \n\t" // mm5 = pb = abs(pbv)
				3680	"psubw %%mm7, %%mm4 \n\t" // mm4 = pa = abs(pav)
				3681	"pxor %%mm0, %%mm0 \n\t"
				3682	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv words < 0
				3683	"pand %%mm6, %%mm0 \n\t" // only pcv words < 0 in mm0
				3684	"psubw %%mm0, %%mm6 \n\t"
				3685	// test pa <= pb
				3686	"movq %%mm4, %%mm7 \n\t"
				3687	"psubw %%mm0, %%mm6 \n\t" // mm6 = pc = abs(pcv)
				3688	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
				3689	"movq %%mm7, %%mm0 \n\t"
				3690	// use mm7 mask to merge pa & pb
				3691	"pand %%mm7, %%mm5 \n\t"
				3692	// use mm0 mask copy to merge a & b
				3693	"pand %%mm0, %%mm2 \n\t"
				3694	"pandn %%mm4, %%mm7 \n\t"
				3695	"pandn %%mm1, %%mm0 \n\t"
				3696	"paddw %%mm5, %%mm7 \n\t" // mm7 = (pa <= pb) ? pa : pb
				3697	"paddw %%mm2, %%mm0 \n\t" // mm0 = (pa <= pb) ? a : b
				3698	// test ((pa <= pb)? pa:pb) <= pc
				3699	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
				3700	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
				3701	"pand %%mm7, %%mm3 \n\t"
				3702	"pandn %%mm0, %%mm7 \n\t"
				3703	"pxor %%mm1, %%mm1 \n\t"
				3704	"paddw %%mm3, %%mm7 \n\t" // mm7 = selected predictor (as words)
				3705	"pxor %%mm0, %%mm0 \n\t"
				3706	"packuswb %%mm1, %%mm7 \n\t" // pack predictor words into the low 4 bytes of mm7
				3707	"movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
				3708	"pand _ActiveMask, %%mm7 \n\t" // keep only the 3 predictor bytes
				3709	"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
				3710	"psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
				3711	// 3 bytes
				3712	// pav = p - a = (a + b - c) - a = b - c
				3713	"movq %%mm2, %%mm4 \n\t"
				3714	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
				3715	"psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
				3716	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
				3717	"movq %%mm7, %%mm1 \n\t"
				3718	"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
				3719	"psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
				3720	// now mm1 will be used as Raw(x-bpp)
				3721	// now do Paeth for 3rd, and final, set of bytes (6-7)
				3722	"pxor %%mm7, %%mm7 \n\t"
				3723	"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
				3724	"psubw %%mm3, %%mm4 \n\t" // mm4 = pav = b - c
				3725	// pbv = p - b = (a + b - c) - b = a - c
				3726	"movq %%mm1, %%mm5 \n\t"
				3727	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				3728	"movq %%mm4, %%mm6 \n\t"
				3729	"psubw %%mm3, %%mm5 \n\t" // mm5 = pbv = a - c
				3730	"pxor %%mm0, %%mm0 \n\t"
				3731	"paddw %%mm5, %%mm6 \n\t" // mm6 = pcv
				3732
				3733	// pa = abs(p-a) = abs(pav)
				3734	// pb = abs(p-b) = abs(pbv)
				3735	// pc = abs(p-c) = abs(pcv)
				3736	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav words < 0
				3737	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv words < 0
				3738	"pand %%mm4, %%mm0 \n\t" // only pav words < 0 in mm0
				3739	"pand %%mm5, %%mm7 \n\t" // only pbv words < 0 in mm7
				3740	"psubw %%mm0, %%mm4 \n\t"
				3741	"psubw %%mm7, %%mm5 \n\t"
				3742	"psubw %%mm0, %%mm4 \n\t" // mm4 = pa = abs(pav)
				3743	"psubw %%mm7, %%mm5 \n\t" // mm5 = pb = abs(pbv)
				3744	"pxor %%mm0, %%mm0 \n\t"
				3745	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv words < 0
				3746	"pand %%mm6, %%mm0 \n\t" // only pcv words < 0 in mm0
				3747	"psubw %%mm0, %%mm6 \n\t"
				3748	// test pa <= pb
				3749	"movq %%mm4, %%mm7 \n\t"
				3750	"psubw %%mm0, %%mm6 \n\t" // mm6 = pc = abs(pcv)
				3751	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
				3752	"movq %%mm7, %%mm0 \n\t"
				3753	// use mm0 mask copy to merge a & b
				3754	"pand %%mm0, %%mm2 \n\t"
				3755	// use mm7 mask to merge pa & pb
				3756	"pand %%mm7, %%mm5 \n\t"
				3757	"pandn %%mm1, %%mm0 \n\t"
				3758	"pandn %%mm4, %%mm7 \n\t"
				3759	"paddw %%mm2, %%mm0 \n\t" // mm0 = (pa <= pb) ? a : b
				3760	"paddw %%mm5, %%mm7 \n\t" // mm7 = (pa <= pb) ? pa : pb
				3761	// test ((pa <= pb)? pa:pb) <= pc
				3762	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
				3763	"pand %%mm7, %%mm3 \n\t"
				3764	"pandn %%mm0, %%mm7 \n\t"
				3765	"paddw %%mm3, %%mm7 \n\t" // mm7 = selected predictor (as words)
				3766	"pxor %%mm1, %%mm1 \n\t"
				3767	"packuswb %%mm7, %%mm1 \n\t" // pack predictor words into the high 4 bytes of mm1 (low 4 bytes = 0)
				3768	// step ecx to next set of 8 bytes and repeat loop til done
				3769	"addl $8, %%ecx \n\t"
				3770	"pand _ActiveMaskEnd, %%mm1 \n\t" // keep only bytes 6-7
				3771	"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
				3772	// Raw(x)
				3773
				3774	"cmpl _MMXLength, %%ecx \n\t"
				3775	"pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
				3776	"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
				3777	// mm1 will be used as Raw(x-bpp) next loop
				3778	// mm3 is reloaded with Prior(x-bpp) bytes at the top of the loop
				3779	"jb paeth_3lp \n\t"
				3780
				3781	: "=S" (dummy_value_S), // output regs (dummy)
				3782	"=D" (dummy_value_D)
				3783
				3784	: "0" (prev_row), // esi // input regs
				3785	"1" (row) // edi
				3786
				3787	: "%ecx" // clobber list; NOTE(review): the asm stores into the row buffer but lists no "memory" clobber -- confirm
				3788	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
				3789	, "%mm0", "%mm1", "%mm2", "%mm3"
				3790	, "%mm4", "%mm5", "%mm6", "%mm7"
				3791	#endif
				3792	);
				3793	}
				3794	break; // end 3 bpp
				3795
				3796	case 6: // 6 bytes per pixel: each 8-byte group is decoded in two passes of 4 bytes
				3797	//case 7: // GRR BOGUS
				3798	//case 5: // GRR BOGUS
				3799	{
				3800	_ActiveMask.use = 0x00000000ffffffffLL; // selects the low 4 bytes of a quadword
				3801	_ActiveMask2.use = 0xffffffff00000000LL; // set here but not referenced by the asm in this case
				3802	_ShiftBpp.use = bpp << 3; // == bpp * 8
				3803	_ShiftRem.use = 64 - _ShiftBpp.use;
				3804
				3805	__asm__ __volatile__ ( // MMX loop: ecx steps from _dif up to _MMXLength, 8 bytes per iteration
				3806	"movl _dif, %%ecx \n\t"
				3807	// preload "movl row, %%edi \n\t"
				3808	// preload "movl prev_row, %%esi \n\t"
				3809	// prime the pump: load the first Raw(x-bpp) data set
				3810	"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
				3811	"pxor %%mm0, %%mm0 \n\t" // mm0 = 0: zero source for the byte->word unpacks
				3812
				3813	"paeth_6lp: \n\t"
				3814	// must shift to position Raw(x-bpp) data
				3815	"psrlq _ShiftRem, %%mm1 \n\t"
				3816	// do first set of 4 bytes
				3817	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
				3818	"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
				3819	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
				3820	"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
				3821	// must shift to position Prior(x-bpp) data
				3822	"psrlq _ShiftRem, %%mm3 \n\t"
				3823	// pav = p - a = (a + b - c) - a = b - c
				3824	"movq %%mm2, %%mm4 \n\t"
				3825	"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
				3826	// pbv = p - b = (a + b - c) - b = a - c
				3827	"movq %%mm1, %%mm5 \n\t"
				3828	"psubw %%mm3, %%mm4 \n\t" // mm4 = pav = b - c
				3829	"pxor %%mm7, %%mm7 \n\t"
				3830	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				3831	"movq %%mm4, %%mm6 \n\t"
				3832	"psubw %%mm3, %%mm5 \n\t" // mm5 = pbv = a - c
				3833	// pa = abs(p-a) = abs(pav)
				3834	// pb = abs(p-b) = abs(pbv)
				3835	// pc = abs(p-c) = abs(pcv)
				3836	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav words < 0
				3837	"paddw %%mm5, %%mm6 \n\t" // mm6 = pcv = pav + pbv
				3838	"pand %%mm4, %%mm0 \n\t" // only pav words < 0 in mm0
				3839	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv words < 0
				3840	"psubw %%mm0, %%mm4 \n\t"
				3841	"pand %%mm5, %%mm7 \n\t" // only pbv words < 0 in mm7
				3842	"psubw %%mm0, %%mm4 \n\t" // negatives subtracted twice: mm4 = pa = abs(pav)
				3843	"psubw %%mm7, %%mm5 \n\t"
				3844	"pxor %%mm0, %%mm0 \n\t"
				3845	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv words < 0
				3846	"pand %%mm6, %%mm0 \n\t" // only pcv words < 0 in mm0
				3847	"psubw %%mm7, %%mm5 \n\t" // negatives subtracted twice: mm5 = pb = abs(pbv)
				3848	"psubw %%mm0, %%mm6 \n\t"
				3849	// test pa <= pb
				3850	"movq %%mm4, %%mm7 \n\t"
				3851	"psubw %%mm0, %%mm6 \n\t" // negatives subtracted twice: mm6 = pc = abs(pcv)
				3852	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
				3853	"movq %%mm7, %%mm0 \n\t"
				3854	// use mm7 mask to merge pa & pb
				3855	"pand %%mm7, %%mm5 \n\t" // pb where pa > pb
				3856	// use mm0 mask copy to merge a & b
				3857	"pand %%mm0, %%mm2 \n\t" // b where pa > pb
				3858	"pandn %%mm4, %%mm7 \n\t" // pa where pa <= pb
				3859	"pandn %%mm1, %%mm0 \n\t" // a where pa <= pb
				3860	"paddw %%mm5, %%mm7 \n\t" // mm7 = (pa <= pb) ? pa : pb
				3861	"paddw %%mm2, %%mm0 \n\t" // mm0 = (pa <= pb) ? a : b
				3862	// test ((pa <= pb)? pa:pb) <= pc
				3863	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
				3864	"pxor %%mm1, %%mm1 \n\t"
				3865	"pand %%mm7, %%mm3 \n\t" // c where pab > pc
				3866	"pandn %%mm0, %%mm7 \n\t" // a or b where pab <= pc
				3867	"paddw %%mm3, %%mm7 \n\t" // mm7 = selected predictor (as words)
				3868	"pxor %%mm0, %%mm0 \n\t"
				3869	"packuswb %%mm1, %%mm7 \n\t" // pack predictor words into the low 4 bytes of mm7
				3870	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
				3871	"pand _ActiveMask, %%mm7 \n\t" // keep only the low 4 bytes
				3872	"psrlq _ShiftRem, %%mm3 \n\t"
				3873	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
				3874	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
				3875	"movq %%mm2, %%mm6 \n\t"
				3876	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
				3877	"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
				3878	"psllq _ShiftBpp, %%mm6 \n\t"
				3879	"movq %%mm7, %%mm5 \n\t"
				3880	"psrlq _ShiftRem, %%mm1 \n\t"
				3881	"por %%mm6, %%mm3 \n\t" // mm3 = previous Prior quadword tail merged with start of current one
				3882	"psllq _ShiftBpp, %%mm5 \n\t"
				3883	"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
				3884	"por %%mm5, %%mm1 \n\t" // mm1 = previous Raw quadword tail merged with bytes just written
				3885	// do second set of 4 bytes
				3886	"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
				3887	"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
				3888	// pav = p - a = (a + b - c) - a = b - c
				3889	"movq %%mm2, %%mm4 \n\t"
				3890	// pbv = p - b = (a + b - c) - b = a - c
				3891	"movq %%mm1, %%mm5 \n\t"
				3892	"psubw %%mm3, %%mm4 \n\t" // mm4 = pav = b - c
				3893	"pxor %%mm7, %%mm7 \n\t"
				3894	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				3895	"movq %%mm4, %%mm6 \n\t"
				3896	"psubw %%mm3, %%mm5 \n\t" // mm5 = pbv = a - c
				3897	// pa = abs(p-a) = abs(pav)
				3898	// pb = abs(p-b) = abs(pbv)
				3899	// pc = abs(p-c) = abs(pcv)
				3900	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav words < 0
				3901	"paddw %%mm5, %%mm6 \n\t" // mm6 = pcv = pav + pbv
				3902	"pand %%mm4, %%mm0 \n\t" // only pav words < 0 in mm0
				3903	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv words < 0
				3904	"psubw %%mm0, %%mm4 \n\t"
				3905	"pand %%mm5, %%mm7 \n\t" // only pbv words < 0 in mm7
				3906	"psubw %%mm0, %%mm4 \n\t" // mm4 = pa = abs(pav)
				3907	"psubw %%mm7, %%mm5 \n\t"
				3908	"pxor %%mm0, %%mm0 \n\t"
				3909	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv words < 0
				3910	"pand %%mm6, %%mm0 \n\t" // only pcv words < 0 in mm0
				3911	"psubw %%mm7, %%mm5 \n\t" // mm5 = pb = abs(pbv)
				3912	"psubw %%mm0, %%mm6 \n\t"
				3913	// test pa <= pb
				3914	"movq %%mm4, %%mm7 \n\t"
				3915	"psubw %%mm0, %%mm6 \n\t" // mm6 = pc = abs(pcv)
				3916	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
				3917	"movq %%mm7, %%mm0 \n\t"
				3918	// use mm7 mask to merge pa & pb
				3919	"pand %%mm7, %%mm5 \n\t"
				3920	// use mm0 mask copy to merge a & b
				3921	"pand %%mm0, %%mm2 \n\t"
				3922	"pandn %%mm4, %%mm7 \n\t"
				3923	"pandn %%mm1, %%mm0 \n\t"
				3924	"paddw %%mm5, %%mm7 \n\t" // mm7 = (pa <= pb) ? pa : pb
				3925	"paddw %%mm2, %%mm0 \n\t" // mm0 = (pa <= pb) ? a : b
				3926	// test ((pa <= pb)? pa:pb) <= pc
				3927	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
				3928	"pxor %%mm1, %%mm1 \n\t"
				3929	"pand %%mm7, %%mm3 \n\t"
				3930	"pandn %%mm0, %%mm7 \n\t"
				3931	"pxor %%mm1, %%mm1 \n\t" // redundant: mm1 was already cleared a few instructions above
				3932	"paddw %%mm3, %%mm7 \n\t" // mm7 = selected predictor (as words)
				3933	"pxor %%mm0, %%mm0 \n\t"
				3934	// step ecx to next set of 8 bytes and repeat loop til done
				3935	"addl $8, %%ecx \n\t"
				3936	"packuswb %%mm7, %%mm1 \n\t" // pack predictor words into the high 4 bytes of mm1 (low 4 bytes = 0)
				3937	"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
				3938	"cmpl _MMXLength, %%ecx \n\t"
				3939	"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
				3940	// mm1 will be used as Raw(x-bpp) next loop
				3941	"jb paeth_6lp \n\t"
				3942
				3943	: "=S" (dummy_value_S), // output regs (dummy)
				3944	"=D" (dummy_value_D)
				3945
				3946	: "0" (prev_row), // esi // input regs
				3947	"1" (row) // edi
				3948
				3949	: "%ecx" // clobber list; NOTE(review): the asm stores into the row buffer but lists no "memory" clobber -- confirm
				3950	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
				3951	, "%mm0", "%mm1", "%mm2", "%mm3"
				3952	, "%mm4", "%mm5", "%mm6", "%mm7"
				3953	#endif
				3954	);
				3955	}
				3956	break; // end 6 bpp
				3957
				3958	case 4: // 4 bytes per pixel: each 8-byte group is decoded in two passes of 4 bytes
				3959	{
				3960	_ActiveMask.use = 0x00000000ffffffffLL; // selects the low 4 bytes of a quadword
				3961
				3962	__asm__ __volatile__ ( // MMX loop: ecx steps from _dif up to _MMXLength, 8 bytes per iteration
				3963	"movl _dif, %%ecx \n\t"
				3964	// preload "movl row, %%edi \n\t"
				3965	// preload "movl prev_row, %%esi \n\t"
				3966	"pxor %%mm0, %%mm0 \n\t" // mm0 = 0: zero source for the byte->word unpacks
				3967	// prime the pump: load the first Raw(x-bpp) data set
				3968	"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
				3969	// a=Raw(x-bpp) bytes
				3970	"paeth_4lp: \n\t"
				3971	// do first set of 4 bytes
				3972	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
				3973	"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
				3974	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
				3975	"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
				3976	// pav = p - a = (a + b - c) - a = b - c
				3977	"movq %%mm2, %%mm4 \n\t"
				3978	"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
				3979	// pbv = p - b = (a + b - c) - b = a - c
				3980	"movq %%mm1, %%mm5 \n\t"
				3981	"psubw %%mm3, %%mm4 \n\t" // mm4 = pav = b - c
				3982	"pxor %%mm7, %%mm7 \n\t"
				3983	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				3984	"movq %%mm4, %%mm6 \n\t"
				3985	"psubw %%mm3, %%mm5 \n\t" // mm5 = pbv = a - c
				3986	// pa = abs(p-a) = abs(pav)
				3987	// pb = abs(p-b) = abs(pbv)
				3988	// pc = abs(p-c) = abs(pcv)
				3989	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav words < 0
				3990	"paddw %%mm5, %%mm6 \n\t" // mm6 = pcv = pav + pbv
				3991	"pand %%mm4, %%mm0 \n\t" // only pav words < 0 in mm0
				3992	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv words < 0
				3993	"psubw %%mm0, %%mm4 \n\t"
				3994	"pand %%mm5, %%mm7 \n\t" // only pbv words < 0 in mm7
				3995	"psubw %%mm0, %%mm4 \n\t" // negatives subtracted twice: mm4 = pa = abs(pav)
				3996	"psubw %%mm7, %%mm5 \n\t"
				3997	"pxor %%mm0, %%mm0 \n\t"
				3998	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv words < 0
				3999	"pand %%mm6, %%mm0 \n\t" // only pcv words < 0 in mm0
				4000	"psubw %%mm7, %%mm5 \n\t" // negatives subtracted twice: mm5 = pb = abs(pbv)
				4001	"psubw %%mm0, %%mm6 \n\t"
				4002	// test pa <= pb
				4003	"movq %%mm4, %%mm7 \n\t"
				4004	"psubw %%mm0, %%mm6 \n\t" // negatives subtracted twice: mm6 = pc = abs(pcv)
				4005	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
				4006	"movq %%mm7, %%mm0 \n\t"
				4007	// use mm7 mask to merge pa & pb
				4008	"pand %%mm7, %%mm5 \n\t" // pb where pa > pb
				4009	// use mm0 mask copy to merge a & b
				4010	"pand %%mm0, %%mm2 \n\t" // b where pa > pb
				4011	"pandn %%mm4, %%mm7 \n\t" // pa where pa <= pb
				4012	"pandn %%mm1, %%mm0 \n\t" // a where pa <= pb
				4013	"paddw %%mm5, %%mm7 \n\t" // mm7 = (pa <= pb) ? pa : pb
				4014	"paddw %%mm2, %%mm0 \n\t" // mm0 = (pa <= pb) ? a : b
				4015	// test ((pa <= pb)? pa:pb) <= pc
				4016	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
				4017	"pxor %%mm1, %%mm1 \n\t"
				4018	"pand %%mm7, %%mm3 \n\t" // c where pab > pc
				4019	"pandn %%mm0, %%mm7 \n\t" // a or b where pab <= pc
				4020	"paddw %%mm3, %%mm7 \n\t" // mm7 = selected predictor (as words)
				4021	"pxor %%mm0, %%mm0 \n\t"
				4022	"packuswb %%mm1, %%mm7 \n\t" // pack predictor words into the low 4 bytes of mm7
				4023	"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
				4024	"pand _ActiveMask, %%mm7 \n\t" // keep only the low 4 bytes
				4025	"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
				4026	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
				4027	"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
				4028	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
				4029	"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
				4030	// do second set of 4 bytes
				4031	"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
				4032	"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
				4033	// pav = p - a = (a + b - c) - a = b - c
				4034	"movq %%mm2, %%mm4 \n\t"
				4035	// pbv = p - b = (a + b - c) - b = a - c
				4036	"movq %%mm1, %%mm5 \n\t"
				4037	"psubw %%mm3, %%mm4 \n\t" // mm4 = pav = b - c
				4038	"pxor %%mm7, %%mm7 \n\t"
				4039	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				4040	"movq %%mm4, %%mm6 \n\t"
				4041	"psubw %%mm3, %%mm5 \n\t" // mm5 = pbv = a - c
				4042	// pa = abs(p-a) = abs(pav)
				4043	// pb = abs(p-b) = abs(pbv)
				4044	// pc = abs(p-c) = abs(pcv)
				4045	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav words < 0
				4046	"paddw %%mm5, %%mm6 \n\t" // mm6 = pcv = pav + pbv
				4047	"pand %%mm4, %%mm0 \n\t" // only pav words < 0 in mm0
				4048	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv words < 0
				4049	"psubw %%mm0, %%mm4 \n\t"
				4050	"pand %%mm5, %%mm7 \n\t" // only pbv words < 0 in mm7
				4051	"psubw %%mm0, %%mm4 \n\t" // mm4 = pa = abs(pav)
				4052	"psubw %%mm7, %%mm5 \n\t"
				4053	"pxor %%mm0, %%mm0 \n\t"
				4054	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv words < 0
				4055	"pand %%mm6, %%mm0 \n\t" // only pcv words < 0 in mm0
				4056	"psubw %%mm7, %%mm5 \n\t" // mm5 = pb = abs(pbv)
				4057	"psubw %%mm0, %%mm6 \n\t"
				4058	// test pa <= pb
				4059	"movq %%mm4, %%mm7 \n\t"
				4060	"psubw %%mm0, %%mm6 \n\t" // mm6 = pc = abs(pcv)
				4061	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
				4062	"movq %%mm7, %%mm0 \n\t"
				4063	// use mm7 mask to merge pa & pb
				4064	"pand %%mm7, %%mm5 \n\t"
				4065	// use mm0 mask copy to merge a & b
				4066	"pand %%mm0, %%mm2 \n\t"
				4067	"pandn %%mm4, %%mm7 \n\t"
				4068	"pandn %%mm1, %%mm0 \n\t"
				4069	"paddw %%mm5, %%mm7 \n\t" // mm7 = (pa <= pb) ? pa : pb
				4070	"paddw %%mm2, %%mm0 \n\t" // mm0 = (pa <= pb) ? a : b
				4071	// test ((pa <= pb)? pa:pb) <= pc
				4072	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
				4073	"pxor %%mm1, %%mm1 \n\t"
				4074	"pand %%mm7, %%mm3 \n\t"
				4075	"pandn %%mm0, %%mm7 \n\t"
				4076	"pxor %%mm1, %%mm1 \n\t" // redundant: mm1 was already cleared a few instructions above
				4077	"paddw %%mm3, %%mm7 \n\t" // mm7 = selected predictor (as words)
				4078	"pxor %%mm0, %%mm0 \n\t"
				4079	// step ecx to next set of 8 bytes and repeat loop til done
				4080	"addl $8, %%ecx \n\t"
				4081	"packuswb %%mm7, %%mm1 \n\t" // pack predictor words into the high 4 bytes of mm1 (low 4 bytes = 0)
				4082	"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
				4083	"cmpl _MMXLength, %%ecx \n\t"
				4084	"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
				4085	// mm1 will be used as Raw(x-bpp) next loop
				4086	"jb paeth_4lp \n\t"
				4087
				4088	: "=S" (dummy_value_S), // output regs (dummy)
				4089	"=D" (dummy_value_D)
				4090
				4091	: "0" (prev_row), // esi // input regs
				4092	"1" (row) // edi
				4093
				4094	: "%ecx" // clobber list; NOTE(review): the asm stores into the row buffer but lists no "memory" clobber -- confirm
				4095	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
				4096	, "%mm0", "%mm1", "%mm2", "%mm3"
				4097	, "%mm4", "%mm5", "%mm6", "%mm7"
				4098	#endif
				4099	);
				4100	}
				4101	break; // end 4 bpp
				4102
				4103	case 8: // bpp == 8: each 8-byte group is decoded in two passes of 4 bytes
				4104	{
				4105	_ActiveMask.use = 0x00000000ffffffffLL; // selects the low 4 bytes of a quadword
				4106
				4107	__asm__ __volatile__ ( // MMX loop: ecx steps from _dif up to _MMXLength, 8 bytes per iteration
				4108	"movl _dif, %%ecx \n\t"
				4109	// preload "movl row, %%edi \n\t"
				4110	// preload "movl prev_row, %%esi \n\t"
				4111	"pxor %%mm0, %%mm0 \n\t" // mm0 = 0: zero source for the byte->word unpacks
				4112	// prime the pump: load the first Raw(x-bpp) data set
				4113	"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
				4114	// a=Raw(x-bpp) bytes
				4115	"paeth_8lp: \n\t"
				4116	// do first set of 4 bytes
				4117	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
				4118	"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
				4119	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
				4120	"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
				4121	// pav = p - a = (a + b - c) - a = b - c
				4122	"movq %%mm2, %%mm4 \n\t"
				4123	"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
				4124	// pbv = p - b = (a + b - c) - b = a - c
				4125	"movq %%mm1, %%mm5 \n\t"
				4126	"psubw %%mm3, %%mm4 \n\t" // mm4 = pav = b - c
				4127	"pxor %%mm7, %%mm7 \n\t"
				4128	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				4129	"movq %%mm4, %%mm6 \n\t"
				4130	"psubw %%mm3, %%mm5 \n\t" // mm5 = pbv = a - c
				4131	// pa = abs(p-a) = abs(pav)
				4132	// pb = abs(p-b) = abs(pbv)
				4133	// pc = abs(p-c) = abs(pcv)
				4134	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav words < 0
				4135	"paddw %%mm5, %%mm6 \n\t" // mm6 = pcv = pav + pbv
				4136	"pand %%mm4, %%mm0 \n\t" // only pav words < 0 in mm0
				4137	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv words < 0
				4138	"psubw %%mm0, %%mm4 \n\t"
				4139	"pand %%mm5, %%mm7 \n\t" // only pbv words < 0 in mm7
				4140	"psubw %%mm0, %%mm4 \n\t" // negatives subtracted twice: mm4 = pa = abs(pav)
				4141	"psubw %%mm7, %%mm5 \n\t"
				4142	"pxor %%mm0, %%mm0 \n\t"
				4143	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv words < 0
				4144	"pand %%mm6, %%mm0 \n\t" // only pcv words < 0 in mm0
				4145	"psubw %%mm7, %%mm5 \n\t" // negatives subtracted twice: mm5 = pb = abs(pbv)
				4146	"psubw %%mm0, %%mm6 \n\t"
				4147	// test pa <= pb
				4148	"movq %%mm4, %%mm7 \n\t"
				4149	"psubw %%mm0, %%mm6 \n\t" // negatives subtracted twice: mm6 = pc = abs(pcv)
				4150	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
				4151	"movq %%mm7, %%mm0 \n\t"
				4152	// use mm7 mask to merge pa & pb
				4153	"pand %%mm7, %%mm5 \n\t" // pb where pa > pb
				4154	// use mm0 mask copy to merge a & b
				4155	"pand %%mm0, %%mm2 \n\t" // b where pa > pb
				4156	"pandn %%mm4, %%mm7 \n\t" // pa where pa <= pb
				4157	"pandn %%mm1, %%mm0 \n\t" // a where pa <= pb
				4158	"paddw %%mm5, %%mm7 \n\t" // mm7 = (pa <= pb) ? pa : pb
				4159	"paddw %%mm2, %%mm0 \n\t" // mm0 = (pa <= pb) ? a : b
				4160	// test ((pa <= pb)? pa:pb) <= pc
				4161	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
				4162	"pxor %%mm1, %%mm1 \n\t"
				4163	"pand %%mm7, %%mm3 \n\t" // c where pab > pc
				4164	"pandn %%mm0, %%mm7 \n\t" // a or b where pab <= pc
				4165	"paddw %%mm3, %%mm7 \n\t" // mm7 = selected predictor (as words)
				4166	"pxor %%mm0, %%mm0 \n\t"
				4167	"packuswb %%mm1, %%mm7 \n\t" // pack predictor words into the low 4 bytes of mm7
				4168	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
				4169	"pand _ActiveMask, %%mm7 \n\t" // keep only the low 4 bytes
				4170	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
				4171	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
				4172	"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
				4173	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
				4174	"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
				4175
				4176	// do second set of 4 bytes
				4177	"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
				4178	"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
				4179	// pav = p - a = (a + b - c) - a = b - c
				4180	"movq %%mm2, %%mm4 \n\t"
				4181	// pbv = p - b = (a + b - c) - b = a - c
				4182	"movq %%mm1, %%mm5 \n\t"
				4183	"psubw %%mm3, %%mm4 \n\t" // mm4 = pav = b - c
				4184	"pxor %%mm7, %%mm7 \n\t"
				4185	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				4186	"movq %%mm4, %%mm6 \n\t"
				4187	"psubw %%mm3, %%mm5 \n\t" // mm5 = pbv = a - c
				4188	// pa = abs(p-a) = abs(pav)
				4189	// pb = abs(p-b) = abs(pbv)
				4190	// pc = abs(p-c) = abs(pcv)
				4191	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav words < 0
				4192	"paddw %%mm5, %%mm6 \n\t" // mm6 = pcv = pav + pbv
				4193	"pand %%mm4, %%mm0 \n\t" // only pav words < 0 in mm0
				4194	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv words < 0
				4195	"psubw %%mm0, %%mm4 \n\t"
				4196	"pand %%mm5, %%mm7 \n\t" // only pbv words < 0 in mm7
				4197	"psubw %%mm0, %%mm4 \n\t" // mm4 = pa = abs(pav)
				4198	"psubw %%mm7, %%mm5 \n\t"
				4199	"pxor %%mm0, %%mm0 \n\t"
				4200	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv words < 0
				4201	"pand %%mm6, %%mm0 \n\t" // only pcv words < 0 in mm0
				4202	"psubw %%mm7, %%mm5 \n\t" // mm5 = pb = abs(pbv)
				4203	"psubw %%mm0, %%mm6 \n\t"
				4204	// test pa <= pb
				4205	"movq %%mm4, %%mm7 \n\t"
				4206	"psubw %%mm0, %%mm6 \n\t" // mm6 = pc = abs(pcv)
				4207	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
				4208	"movq %%mm7, %%mm0 \n\t"
				4209	// use mm7 mask to merge pa & pb
				4210	"pand %%mm7, %%mm5 \n\t"
				4211	// use mm0 mask copy to merge a & b
				4212	"pand %%mm0, %%mm2 \n\t"
				4213	"pandn %%mm4, %%mm7 \n\t"
				4214	"pandn %%mm1, %%mm0 \n\t"
				4215	"paddw %%mm5, %%mm7 \n\t" // mm7 = (pa <= pb) ? pa : pb
				4216	"paddw %%mm2, %%mm0 \n\t" // mm0 = (pa <= pb) ? a : b
				4217	// test ((pa <= pb)? pa:pb) <= pc
				4218	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
				4219	"pxor %%mm1, %%mm1 \n\t"
				4220	"pand %%mm7, %%mm3 \n\t"
				4221	"pandn %%mm0, %%mm7 \n\t"
				4222	"pxor %%mm1, %%mm1 \n\t" // redundant: mm1 was already cleared a few instructions above
				4223	"paddw %%mm3, %%mm7 \n\t" // mm7 = selected predictor (as words)
				4224	"pxor %%mm0, %%mm0 \n\t"
				4225	// step ecx to next set of 8 bytes and repeat loop til done
				4226	"addl $8, %%ecx \n\t"
				4227	"packuswb %%mm7, %%mm1 \n\t" // pack predictor words into the high 4 bytes of mm1 (low 4 bytes = 0)
				4228	"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
				4229	"cmpl _MMXLength, %%ecx \n\t"
				4230	"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
				4231	// mm1 will be used as Raw(x-bpp) next loop
				4232	"jb paeth_8lp \n\t"
				4233
				4234	: "=S" (dummy_value_S), // output regs (dummy)
				4235	"=D" (dummy_value_D)
				4236
				4237	: "0" (prev_row), // esi // input regs
				4238	"1" (row) // edi
				4239
				4240	: "%ecx" // clobber list; NOTE(review): the asm stores into the row buffer but lists no "memory" clobber -- confirm
				4241	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
				4242	, "%mm0", "%mm1", "%mm2", "%mm3"
				4243	, "%mm4", "%mm5", "%mm6", "%mm7"
				4244	#endif
				4245	);
				4246	}
				4247	break; // end 8 bpp
				4248
				4249	case 1: // bpp = 1
				4250	case 2: // bpp = 2
				4251	default: // bpp > 8
				4252	{
				4253	__asm__ __volatile__ ( // scalar Paeth decode, one byte per iteration, for ebx = _dif up to _FullLength
				4254	#ifdef __PIC__
				4255	"pushl %%ebx \n\t" // save Global Offset Table index
				4256	#endif
				4257	"movl _dif, %%ebx \n\t"
				4258	"cmpl _FullLength, %%ebx \n\t"
				4259	"jnb paeth_dend \n\t" // nothing to decode if _dif >= _FullLength
				4260
				4261	// preload "movl row, %%edi \n\t"
				4262	// preload "movl prev_row, %%esi \n\t"
				4263	// do Paeth decode for remaining bytes
				4264	"movl %%ebx, %%edx \n\t"
				4265	// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
				4266	"subl %%ecx, %%edx \n\t" // edx = ebx - bpp
				4267	"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
				4268
				4269	"paeth_dlp: \n\t"
				4270	"xorl %%eax, %%eax \n\t"
				4271	// pav = p - a = (a + b - c) - a = b - c
				4272	"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
				4273	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
				4274	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
				4275	"movl %%eax, _patemp \n\t" // Save pav for later use
				4276	"xorl %%eax, %%eax \n\t"
				4277	// pbv = p - b = (a + b - c) - b = a - c
				4278	"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
				4279	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
				4280	"movl %%eax, %%ecx \n\t" // ecx = pbv
				4281	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				4282	"addl _patemp, %%eax \n\t" // pcv = pav + pbv
				4283	// pc = abs(pcv)
				4284	"testl $0x80000000, %%eax \n\t" // sign bit set?
				4285	"jz paeth_dpca \n\t"
				4286	"negl %%eax \n\t" // reverse sign of neg values
				4287
				4288	"paeth_dpca: \n\t"
				4289	"movl %%eax, _pctemp \n\t" // save pc for later use
				4290	// pb = abs(pbv)
				4291	"testl $0x80000000, %%ecx \n\t" // sign bit set?
				4292	"jz paeth_dpba \n\t"
				4293	"negl %%ecx \n\t" // reverse sign of neg values
				4294
				4295	"paeth_dpba: \n\t"
				4296	"movl %%ecx, _pbtemp \n\t" // save pb (not read back within this asm block; pb stays in ecx)
				4297	// pa = abs(pav)
				4298	"movl _patemp, %%eax \n\t"
				4299	"testl $0x80000000, %%eax \n\t" // sign bit set?
				4300	"jz paeth_dpaa \n\t"
				4301	"negl %%eax \n\t" // reverse sign of neg values
				4302
				4303	"paeth_dpaa: \n\t"
				4304	"movl %%eax, _patemp \n\t" // save pa (not read back within this asm block; pa stays in eax)
				4305	// test if pa <= pb
				4306	"cmpl %%ecx, %%eax \n\t"
				4307	"jna paeth_dabb \n\t"
				4308	// pa > pb; now test if pb <= pc
				4309	"cmpl _pctemp, %%ecx \n\t"
				4310	"jna paeth_dbbc \n\t"
				4311	// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
				4312	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
				4313	"jmp paeth_dpaeth \n\t"
				4314
				4315	"paeth_dbbc: \n\t"
				4316	// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
				4317	"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
				4318	"jmp paeth_dpaeth \n\t"
				4319
				4320	"paeth_dabb: \n\t"
				4321	// pa <= pb; now test if pa <= pc
				4322	"cmpl _pctemp, %%eax \n\t"
				4323	"jna paeth_dabc \n\t"
				4324	// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
				4325	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
				4326	"jmp paeth_dpaeth \n\t"
				4327
				4328	"paeth_dabc: \n\t"
				4329	// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
				4330	"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
				4331
				4332	"paeth_dpaeth: \n\t"
				4333	"incl %%ebx \n\t"
				4334	"incl %%edx \n\t"
				4335	// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
				4336	"addb %%cl, -1(%%edi,%%ebx,) \n\t" // -1 because ebx was already advanced to the next byte
				4337	"cmpl _FullLength, %%ebx \n\t"
				4338	"jb paeth_dlp \n\t"
				4339
				4340	"paeth_dend: \n\t"
				4341	#ifdef __PIC__
				4342	"popl %%ebx \n\t" // restore index to Global Offset Table
				4343	#endif
				4344
				4345	: "=c" (dummy_value_c), // output regs (dummy)
				4346	"=S" (dummy_value_S),
				4347	"=D" (dummy_value_D)
				4348
				4349	: "0" (bpp), // ecx // input regs
				4350	"1" (prev_row), // esi
				4351	"2" (row) // edi
				4352
				4353	: "%eax", "%edx" // clobber list; NOTE(review): the asm stores into the row buffer and the _p?temp variables but lists no "memory" clobber -- confirm
				4354	#ifndef __PIC__
				4355	, "%ebx"
				4356	#endif
				4357	);
				4358	}
				4359	return; // No need to go further with this one
				4360
				4361	} // end switch (bpp)
				4362
				4363	__asm__ __volatile__ (
				4364	// MMX acceleration complete; now do clean-up
				4365	// check if any remaining bytes left to decode
				4366	#ifdef __PIC__
				4367	"pushl %%ebx \n\t" // save index to Global Offset Table
				4368	#endif
				4369	"movl _MMXLength, %%ebx \n\t"
				4370	"cmpl _FullLength, %%ebx \n\t"
				4371	"jnb paeth_end \n\t"
				4372	//pre "movl row, %%edi \n\t"
				4373	//pre "movl prev_row, %%esi \n\t"
				4374	// do Paeth decode for remaining bytes
				4375	"movl %%ebx, %%edx \n\t"
				4376	//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
				4377	"subl %%ecx, %%edx \n\t" // edx = ebx - bpp
				4378	"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
				4379
				4380	"paeth_lp2: \n\t"
				4381	"xorl %%eax, %%eax \n\t"
				4382	// pav = p - a = (a + b - c) - a = b - c
				4383	"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
				4384	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
				4385	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
				4386	"movl %%eax, _patemp \n\t" // Save pav for later use
				4387	"xorl %%eax, %%eax \n\t"
				4388	// pbv = p - b = (a + b - c) - b = a - c
				4389	"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
				4390	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
				4391	"movl %%eax, %%ecx \n\t"
				4392	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
				4393	"addl _patemp, %%eax \n\t" // pcv = pav + pbv
				4394	// pc = abs(pcv)
				4395	"testl $0x80000000, %%eax \n\t"
				4396	"jz paeth_pca2 \n\t"
				4397	"negl %%eax \n\t" // reverse sign of neg values
				4398
				4399	"paeth_pca2: \n\t"
				4400	"movl %%eax, _pctemp \n\t" // save pc for later use
				4401	// pb = abs(pbv)
				4402	"testl $0x80000000, %%ecx \n\t"
				4403	"jz paeth_pba2 \n\t"
				4404	"negl %%ecx \n\t" // reverse sign of neg values
				4405
				4406	"paeth_pba2: \n\t"
				4407	"movl %%ecx, _pbtemp \n\t" // save pb for later use
				4408	// pa = abs(pav)
				4409	"movl _patemp, %%eax \n\t"
				4410	"testl $0x80000000, %%eax \n\t"
				4411	"jz paeth_paa2 \n\t"
				4412	"negl %%eax \n\t" // reverse sign of neg values
				4413
				4414	"paeth_paa2: \n\t"
				4415	"movl %%eax, _patemp \n\t" // save pa for later use
				4416	// test if pa <= pb
				4417	"cmpl %%ecx, %%eax \n\t"
				4418	"jna paeth_abb2 \n\t"
				4419	// pa > pb; now test if pb <= pc
				4420	"cmpl _pctemp, %%ecx \n\t"
				4421	"jna paeth_bbc2 \n\t"
				4422	// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
				4423	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
				4424	"jmp paeth_paeth2 \n\t"
				4425
				4426	"paeth_bbc2: \n\t"
				4427	// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
				4428	"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
				4429	"jmp paeth_paeth2 \n\t"
				4430
				4431	"paeth_abb2: \n\t"
				4432	// pa <= pb; now test if pa <= pc
				4433	"cmpl _pctemp, %%eax \n\t"
				4434	"jna paeth_abc2 \n\t"
				4435	// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
				4436	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
				4437	"jmp paeth_paeth2 \n\t"
				4438
				4439	"paeth_abc2: \n\t"
				4440	// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
				4441	"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
				4442
				4443	"paeth_paeth2: \n\t"
				4444	"incl %%ebx \n\t"
				4445	"incl %%edx \n\t"
				4446	// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
				4447	"addb %%cl, -1(%%edi,%%ebx,) \n\t"
				4448	"cmpl _FullLength, %%ebx \n\t"
				4449	"jb paeth_lp2 \n\t"
				4450
				4451	"paeth_end: \n\t"
				4452	"EMMS \n\t" // end MMX; prep for poss. FP instrs.
				4453	#ifdef __PIC__
				4454	"popl %%ebx \n\t" // restore index to Global Offset Table
				4455	#endif
				4456
				4457	: "=c" (dummy_value_c), // output regs (dummy)
				4458	"=S" (dummy_value_S),
				4459	"=D" (dummy_value_D)
				4460
				4461	: "0" (bpp), // ecx // input regs
				4462	"1" (prev_row), // esi
				4463	"2" (row) // edi
				4464
				4465	: "%eax", "%edx" // clobber list (no input regs!)
				4466	#ifndef __PIC__
				4467	, "%ebx"
				4468	#endif
				4469	);
				4470
				4471	} /* end png_read_filter_row_mmx_paeth() */
				4472	#endif
				4473
				4474
				4475
				4476
				4477	#ifdef PNG_THREAD_UNSAFE_OK
				4478	//===========================================================================//
				4479	// //
				4480	// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
				4481	// //
				4482	//===========================================================================//
				4483
				4484	// Optimized code for PNG Sub filter decoder
				4485
				4486	static void /* PRIVATE */
				4487	png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
				4488	{
				4489	int bpp;
				4490	int dummy_value_a;
				4491	int dummy_value_D;
				4492
				4493	bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
				4494	_FullLength = row_info->rowbytes - bpp; // number of bytes to filter
				4495
				4496	__asm__ __volatile__ (
				4497	//pre "movl row, %%edi \n\t"
				4498	"movl %%edi, %%esi \n\t" // lp = row
				4499	//pre "movl bpp, %%eax \n\t"
				4500	"addl %%eax, %%edi \n\t" // rp = row + bpp
				4501	//irr "xorl %%eax, %%eax \n\t"
				4502	// get # of bytes to alignment
				4503	"movl %%edi, _dif \n\t" // take start of row
				4504	"addl $0xf, _dif \n\t" // add 7 + 8 to incr past
				4505	// alignment boundary
				4506	"xorl %%ecx, %%ecx \n\t"
				4507	"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
				4508	"subl %%edi, _dif \n\t" // subtract from start ==> value
				4509	"jz sub_go \n\t" // ecx at alignment
				4510
				4511	"sub_lp1: \n\t" // fix alignment
				4512	"movb (%%esi,%%ecx,), %%al \n\t"
				4513	"addb %%al, (%%edi,%%ecx,) \n\t"
				4514	"incl %%ecx \n\t"
				4515	"cmpl _dif, %%ecx \n\t"
				4516	"jb sub_lp1 \n\t"
				4517
				4518	"sub_go: \n\t"
				4519	"movl _FullLength, %%eax \n\t"
				4520	"movl %%eax, %%edx \n\t"
				4521	"subl %%ecx, %%edx \n\t" // subtract alignment fix
				4522	"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
				4523	"subl %%edx, %%eax \n\t" // drop over bytes from length
				4524	"movl %%eax, _MMXLength \n\t"
				4525
				4526	: "=a" (dummy_value_a), // 0 // output regs (dummy)
				4527	"=D" (dummy_value_D) // 1
				4528
				4529	: "0" (bpp), // eax // input regs
				4530	"1" (row) // edi
				4531
				4532	: "%ebx", "%ecx", "%edx" // clobber list
				4533	, "%esi"
				4534
				4535	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				4536	, "%mm0", "%mm1", "%mm2", "%mm3"
				4537	, "%mm4", "%mm5", "%mm6", "%mm7"
				4538	#endif
				4539	);
				4540
				4541	// now do the math for the rest of the row
				4542	switch (bpp)
				4543	{
				4544	case 3:
				4545	{
				4546	_ActiveMask.use = 0x0000ffffff000000LL;
				4547	_ShiftBpp.use = 24; // == 3 * 8
				4548	_ShiftRem.use = 40; // == 64 - 24
				4549
				4550	__asm__ __volatile__ (
				4551	// preload "movl row, %%edi \n\t"
				4552	"movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
				4553	// active byte group
				4554	"movl %%edi, %%esi \n\t" // lp = row
				4555	// preload "movl bpp, %%eax \n\t"
				4556	"addl %%eax, %%edi \n\t" // rp = row + bpp
				4557	"movq %%mm7, %%mm6 \n\t"
				4558	"movl _dif, %%edx \n\t"
				4559	"psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
				4560	// 3rd active byte group
				4561	// prime the pump: load the first Raw(x-bpp) data set
				4562	"movq -8(%%edi,%%edx,), %%mm1 \n\t"
				4563
				4564	"sub_3lp: \n\t" // shift data for adding first
				4565	"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
				4566	// shift clears inactive bytes)
				4567	// add 1st active group
				4568	"movq (%%edi,%%edx,), %%mm0 \n\t"
				4569	"paddb %%mm1, %%mm0 \n\t"
				4570
				4571	// add 2nd active group
				4572	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
				4573	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
				4574	"pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
				4575	"paddb %%mm1, %%mm0 \n\t"
				4576
				4577	// add 3rd active group
				4578	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
				4579	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
				4580	"pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
				4581	"addl $8, %%edx \n\t"
				4582	"paddb %%mm1, %%mm0 \n\t"
				4583
				4584	"cmpl _MMXLength, %%edx \n\t"
				4585	"movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
				4586	"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
				4587	"jb sub_3lp \n\t"
				4588
				4589	: "=a" (dummy_value_a), // 0 // output regs (dummy)
				4590	"=D" (dummy_value_D) // 1
				4591
				4592	: "0" (bpp), // eax // input regs
				4593	"1" (row) // edi
				4594
				4595	: "%edx", "%esi" // clobber list
				4596	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				4597	, "%mm0", "%mm1", "%mm6", "%mm7"
				4598	#endif
				4599	);
				4600	}
				4601	break;
				4602
				4603	case 1:
				4604	{
				4605	__asm__ __volatile__ (
				4606	"movl _dif, %%edx \n\t"
				4607	// preload "movl row, %%edi \n\t"
				4608	"cmpl _FullLength, %%edx \n\t"
				4609	"jnb sub_1end \n\t"
				4610	"movl %%edi, %%esi \n\t" // lp = row
				4611	"xorl %%eax, %%eax \n\t"
				4612	// preload "movl bpp, %%eax \n\t"
				4613	"addl %%eax, %%edi \n\t" // rp = row + bpp
				4614
				4615	"sub_1lp: \n\t"
				4616	"movb (%%esi,%%edx,), %%al \n\t"
				4617	"addb %%al, (%%edi,%%edx,) \n\t"
				4618	"incl %%edx \n\t"
				4619	"cmpl _FullLength, %%edx \n\t"
				4620	"jb sub_1lp \n\t"
				4621
				4622	"sub_1end: \n\t"
				4623
				4624	: "=a" (dummy_value_a), // 0 // output regs (dummy)
				4625	"=D" (dummy_value_D) // 1
				4626
				4627	: "0" (bpp), // eax // input regs
				4628	"1" (row) // edi
				4629
				4630	: "%edx", "%esi" // clobber list
				4631	);
				4632	}
				4633	return;
				4634
				4635	case 6:
				4636	case 4:
				4637	//case 7: // GRR BOGUS
				4638	//case 5: // GRR BOGUS
				4639	{
				4640	_ShiftBpp.use = bpp << 3;
				4641	_ShiftRem.use = 64 - _ShiftBpp.use;
				4642
				4643	__asm__ __volatile__ (
				4644	// preload "movl row, %%edi \n\t"
				4645	"movl _dif, %%edx \n\t"
				4646	"movl %%edi, %%esi \n\t" // lp = row
				4647	// preload "movl bpp, %%eax \n\t"
				4648	"addl %%eax, %%edi \n\t" // rp = row + bpp
				4649
				4650	// prime the pump: load the first Raw(x-bpp) data set
				4651	"movq -8(%%edi,%%edx,), %%mm1 \n\t"
				4652
				4653	"sub_4lp: \n\t" // shift data for adding first
				4654	"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
				4655	// shift clears inactive bytes)
				4656	"movq (%%edi,%%edx,), %%mm0 \n\t"
				4657	"paddb %%mm1, %%mm0 \n\t"
				4658
				4659	// add 2nd active group
				4660	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
				4661	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
				4662	"addl $8, %%edx \n\t"
				4663	"paddb %%mm1, %%mm0 \n\t"
				4664
				4665	"cmpl _MMXLength, %%edx \n\t"
				4666	"movq %%mm0, -8(%%edi,%%edx,) \n\t"
				4667	"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
				4668	"jb sub_4lp \n\t"
				4669
				4670	: "=a" (dummy_value_a), // 0 // output regs (dummy)
				4671	"=D" (dummy_value_D) // 1
				4672
				4673	: "0" (bpp), // eax // input regs
				4674	"1" (row) // edi
				4675
				4676	: "%edx", "%esi" // clobber list
				4677	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				4678	, "%mm0", "%mm1"
				4679	#endif
				4680	);
				4681	}
				4682	break;
				4683
				4684	case 2:
				4685	{
				4686	_ActiveMask.use = 0x00000000ffff0000LL;
				4687	_ShiftBpp.use = 16; // == 2 * 8
				4688	_ShiftRem.use = 48; // == 64 - 16
				4689
				4690	__asm__ __volatile__ (
				4691	"movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
				4692	// active byte group
				4693	"movl _dif, %%edx \n\t"
				4694	"movq %%mm7, %%mm6 \n\t"
				4695	// preload "movl row, %%edi \n\t"
				4696	"psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
				4697	// 3rd active byte group
				4698	"movl %%edi, %%esi \n\t" // lp = row
				4699	"movq %%mm6, %%mm5 \n\t"
				4700	// preload "movl bpp, %%eax \n\t"
				4701	"addl %%eax, %%edi \n\t" // rp = row + bpp
				4702	"psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
				4703	// 4th active byte group
				4704	// prime the pump: load the first Raw(x-bpp) data set
				4705	"movq -8(%%edi,%%edx,), %%mm1 \n\t"
				4706
				4707	"sub_2lp: \n\t" // shift data for adding first
				4708	"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
				4709	// shift clears inactive bytes)
				4710	// add 1st active group
				4711	"movq (%%edi,%%edx,), %%mm0 \n\t"
				4712	"paddb %%mm1, %%mm0 \n\t"
				4713
				4714	// add 2nd active group
				4715	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
				4716	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
				4717	"pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
				4718	"paddb %%mm1, %%mm0 \n\t"
				4719
				4720	// add 3rd active group
				4721	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
				4722	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
				4723	"pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
				4724	"paddb %%mm1, %%mm0 \n\t"
				4725
				4726	// add 4th active group
				4727	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
				4728	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
				4729	"pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
				4730	"addl $8, %%edx \n\t"
				4731	"paddb %%mm1, %%mm0 \n\t"
				4732	"cmpl _MMXLength, %%edx \n\t"
				4733	"movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
				4734	"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
				4735	"jb sub_2lp \n\t"
				4736
				4737	: "=a" (dummy_value_a), // 0 // output regs (dummy)
				4738	"=D" (dummy_value_D) // 1
				4739
				4740	: "0" (bpp), // eax // input regs
				4741	"1" (row) // edi
				4742
				4743	: "%edx", "%esi" // clobber list
				4744	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				4745	, "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
				4746	#endif
				4747	);
				4748	}
				4749	break;
				4750
				4751	case 8:
				4752	{
				4753	__asm__ __volatile__ (
				4754	// preload "movl row, %%edi \n\t"
				4755	"movl _dif, %%edx \n\t"
				4756	"movl %%edi, %%esi \n\t" // lp = row
				4757	// preload "movl bpp, %%eax \n\t"
				4758	"addl %%eax, %%edi \n\t" // rp = row + bpp
				4759	"movl _MMXLength, %%ecx \n\t"
				4760
				4761	// prime the pump: load the first Raw(x-bpp) data set
				4762	"movq -8(%%edi,%%edx,), %%mm7 \n\t"
				4763	"andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
				4764
				4765	"sub_8lp: \n\t"
				4766	"movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
				4767	"paddb %%mm7, %%mm0 \n\t"
				4768	"movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
				4769	"movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
				4770
				4771	// Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
				4772	// This will be repeated for each group of 8 bytes with the 8th
				4773	// group being used as the Raw(x-bpp) for the 1st group of the
				4774	// next loop.
				4775
				4776	"paddb %%mm0, %%mm1 \n\t"
				4777	"movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
				4778	"movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
				4779	"paddb %%mm1, %%mm2 \n\t"
				4780	"movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
				4781	"movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
				4782	"paddb %%mm2, %%mm3 \n\t"
				4783	"movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
				4784	"movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
				4785	"paddb %%mm3, %%mm4 \n\t"
				4786	"movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
				4787	"movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
				4788	"paddb %%mm4, %%mm5 \n\t"
				4789	"movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
				4790	"movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
				4791	"paddb %%mm5, %%mm6 \n\t"
				4792	"movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
				4793	"movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
				4794	"addl $64, %%edx \n\t"
				4795	"paddb %%mm6, %%mm7 \n\t"
				4796	"cmpl %%ecx, %%edx \n\t"
				4797	"movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
				4798	"jb sub_8lp \n\t"
				4799
				4800	"cmpl _MMXLength, %%edx \n\t"
				4801	"jnb sub_8lt8 \n\t"
				4802
				4803	"sub_8lpA: \n\t"
				4804	"movq (%%edi,%%edx,), %%mm0 \n\t"
				4805	"addl $8, %%edx \n\t"
				4806	"paddb %%mm7, %%mm0 \n\t"
				4807	"cmpl _MMXLength, %%edx \n\t"
				4808	"movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
				4809	"movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
				4810	// to mm1 to be new Raw(x-bpp)
				4811	// for next loop
				4812	"jb sub_8lpA \n\t"
				4813
				4814	"sub_8lt8: \n\t"
				4815
				4816	: "=a" (dummy_value_a), // 0 // output regs (dummy)
				4817	"=D" (dummy_value_D) // 1
				4818
				4819	: "0" (bpp), // eax // input regs
				4820	"1" (row) // edi
				4821
				4822	: "%ecx", "%edx", "%esi" // clobber list
				4823	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				4824	, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
				4825	#endif
				4826	);
				4827	}
				4828	break;
				4829
				4830	default: // bpp greater than 8 bytes GRR BOGUS
				4831	{
				4832	__asm__ __volatile__ (
				4833	"movl _dif, %%edx \n\t"
				4834	// preload "movl row, %%edi \n\t"
				4835	"movl %%edi, %%esi \n\t" // lp = row
				4836	// preload "movl bpp, %%eax \n\t"
				4837	"addl %%eax, %%edi \n\t" // rp = row + bpp
				4838
				4839	"sub_Alp: \n\t"
				4840	"movq (%%edi,%%edx,), %%mm0 \n\t"
				4841	"movq (%%esi,%%edx,), %%mm1 \n\t"
				4842	"addl $8, %%edx \n\t"
				4843	"paddb %%mm1, %%mm0 \n\t"
				4844	"cmpl _MMXLength, %%edx \n\t"
				4845	"movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
				4846	// -8 to offset addl edx
				4847	"jb sub_Alp \n\t"
				4848
				4849	: "=a" (dummy_value_a), // 0 // output regs (dummy)
				4850	"=D" (dummy_value_D) // 1
				4851
				4852	: "0" (bpp), // eax // input regs
				4853	"1" (row) // edi
				4854
				4855	: "%edx", "%esi" // clobber list
				4856	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				4857	, "%mm0", "%mm1"
				4858	#endif
				4859	);
				4860	}
				4861	break;
				4862
				4863	} // end switch (bpp)
				4864
				4865	__asm__ __volatile__ (
				4866	"movl _MMXLength, %%edx \n\t"
				4867	//pre "movl row, %%edi \n\t"
				4868	"cmpl _FullLength, %%edx \n\t"
				4869	"jnb sub_end \n\t"
				4870
				4871	"movl %%edi, %%esi \n\t" // lp = row
				4872	//pre "movl bpp, %%eax \n\t"
				4873	"addl %%eax, %%edi \n\t" // rp = row + bpp
				4874	"xorl %%eax, %%eax \n\t"
				4875
				4876	"sub_lp2: \n\t"
				4877	"movb (%%esi,%%edx,), %%al \n\t"
				4878	"addb %%al, (%%edi,%%edx,) \n\t"
				4879	"incl %%edx \n\t"
				4880	"cmpl _FullLength, %%edx \n\t"
				4881	"jb sub_lp2 \n\t"
				4882
				4883	"sub_end: \n\t"
				4884	"EMMS \n\t" // end MMX instructions
				4885
				4886	: "=a" (dummy_value_a), // 0 // output regs (dummy)
				4887	"=D" (dummy_value_D) // 1
				4888
				4889	: "0" (bpp), // eax // input regs
				4890	"1" (row) // edi
				4891
				4892	: "%edx", "%esi" // clobber list
				4893	);
				4894
				4895	} // end of png_read_filter_row_mmx_sub()
				4896	#endif
				4897
				4898
				4899
				4900
				4901	//===========================================================================//
				4902	// //
				4903	// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
				4904	// //
				4905	//===========================================================================//
				4906
				4907	// Optimized code for PNG Up filter decoder
				4908
				4909	static void /* PRIVATE */
				4910	png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
				4911	png_bytep prev_row)
				4912	{
				4913	png_uint_32 len;
				4914	int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
				4915	int dummy_value_S;
				4916	int dummy_value_D;
				4917
				4918	len = row_info->rowbytes; // number of bytes to filter
				4919
				4920	__asm__ __volatile__ (
				4921	//pre "movl row, %%edi \n\t"
				4922	// get # of bytes to alignment
				4923	#ifdef __PIC__
				4924	"pushl %%ebx \n\t"
				4925	#endif
				4926	"movl %%edi, %%ecx \n\t"
				4927	"xorl %%ebx, %%ebx \n\t"
				4928	"addl $0x7, %%ecx \n\t"
				4929	"xorl %%eax, %%eax \n\t"
				4930	"andl $0xfffffff8, %%ecx \n\t"
				4931	//pre "movl prev_row, %%esi \n\t"
				4932	"subl %%edi, %%ecx \n\t"
				4933	"jz up_go \n\t"
				4934
				4935	"up_lp1: \n\t" // fix alignment
				4936	"movb (%%edi,%%ebx,), %%al \n\t"
				4937	"addb (%%esi,%%ebx,), %%al \n\t"
				4938	"incl %%ebx \n\t"
				4939	"cmpl %%ecx, %%ebx \n\t"
				4940	"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
				4941	"jb up_lp1 \n\t" // offset incl ebx
				4942
				4943	"up_go: \n\t"
				4944	//pre "movl len, %%edx \n\t"
				4945	"movl %%edx, %%ecx \n\t"
				4946	"subl %%ebx, %%edx \n\t" // subtract alignment fix
				4947	"andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
				4948	"subl %%edx, %%ecx \n\t" // drop over bytes from length
				4949
				4950	// unrolled loop - use all MMX registers and interleave to reduce
				4951	// number of branch instructions (loops) and reduce partial stalls
				4952	"up_loop: \n\t"
				4953	"movq (%%esi,%%ebx,), %%mm1 \n\t"
				4954	"movq (%%edi,%%ebx,), %%mm0 \n\t"
				4955	"movq 8(%%esi,%%ebx,), %%mm3 \n\t"
				4956	"paddb %%mm1, %%mm0 \n\t"
				4957	"movq 8(%%edi,%%ebx,), %%mm2 \n\t"
				4958	"movq %%mm0, (%%edi,%%ebx,) \n\t"
				4959	"paddb %%mm3, %%mm2 \n\t"
				4960	"movq 16(%%esi,%%ebx,), %%mm5 \n\t"
				4961	"movq %%mm2, 8(%%edi,%%ebx,) \n\t"
				4962	"movq 16(%%edi,%%ebx,), %%mm4 \n\t"
				4963	"movq 24(%%esi,%%ebx,), %%mm7 \n\t"
				4964	"paddb %%mm5, %%mm4 \n\t"
				4965	"movq 24(%%edi,%%ebx,), %%mm6 \n\t"
				4966	"movq %%mm4, 16(%%edi,%%ebx,) \n\t"
				4967	"paddb %%mm7, %%mm6 \n\t"
				4968	"movq 32(%%esi,%%ebx,), %%mm1 \n\t"
				4969	"movq %%mm6, 24(%%edi,%%ebx,) \n\t"
				4970	"movq 32(%%edi,%%ebx,), %%mm0 \n\t"
				4971	"movq 40(%%esi,%%ebx,), %%mm3 \n\t"
				4972	"paddb %%mm1, %%mm0 \n\t"
				4973	"movq 40(%%edi,%%ebx,), %%mm2 \n\t"
				4974	"movq %%mm0, 32(%%edi,%%ebx,) \n\t"
				4975	"paddb %%mm3, %%mm2 \n\t"
				4976	"movq 48(%%esi,%%ebx,), %%mm5 \n\t"
				4977	"movq %%mm2, 40(%%edi,%%ebx,) \n\t"
				4978	"movq 48(%%edi,%%ebx,), %%mm4 \n\t"
				4979	"movq 56(%%esi,%%ebx,), %%mm7 \n\t"
				4980	"paddb %%mm5, %%mm4 \n\t"
				4981	"movq 56(%%edi,%%ebx,), %%mm6 \n\t"
				4982	"movq %%mm4, 48(%%edi,%%ebx,) \n\t"
				4983	"addl $64, %%ebx \n\t"
				4984	"paddb %%mm7, %%mm6 \n\t"
				4985	"cmpl %%ecx, %%ebx \n\t"
				4986	"movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
				4987	"jb up_loop \n\t" // -8 to offset addl ebx
				4988
				4989	"cmpl $0, %%edx \n\t" // test for bytes over mult of 64
				4990	"jz up_end \n\t"
				4991
				4992	"cmpl $8, %%edx \n\t" // test for less than 8 bytes
				4993	"jb up_lt8 \n\t" // [added by lcreeve@netins.net]
				4994
				4995	"addl %%edx, %%ecx \n\t"
				4996	"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
				4997	"subl %%edx, %%ecx \n\t" // drop over bytes from length
				4998	"jz up_lt8 \n\t"
				4999
				5000	"up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
				5001	"movq (%%esi,%%ebx,), %%mm1 \n\t"
				5002	"movq (%%edi,%%ebx,), %%mm0 \n\t"
				5003	"addl $8, %%ebx \n\t"
				5004	"paddb %%mm1, %%mm0 \n\t"
				5005	"cmpl %%ecx, %%ebx \n\t"
				5006	"movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
				5007	"jb up_lpA \n\t" // offset add ebx
				5008	"cmpl $0, %%edx \n\t" // test for bytes over mult of 8
				5009	"jz up_end \n\t"
				5010
				5011	"up_lt8: \n\t"
				5012	"xorl %%eax, %%eax \n\t"
				5013	"addl %%edx, %%ecx \n\t" // move over byte count into counter
				5014
				5015	"up_lp2: \n\t" // use x86 regs for remaining bytes
				5016	"movb (%%edi,%%ebx,), %%al \n\t"
				5017	"addb (%%esi,%%ebx,), %%al \n\t"
				5018	"incl %%ebx \n\t"
				5019	"cmpl %%ecx, %%ebx \n\t"
				5020	"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
				5021	"jb up_lp2 \n\t" // offset inc ebx
				5022
				5023	"up_end: \n\t"
				5024	"EMMS \n\t" // conversion of filtered row complete
				5025	#ifdef __PIC__
				5026	"popl %%ebx \n\t"
				5027	#endif
				5028
				5029	: "=d" (dummy_value_d), // 0 // output regs (dummy)
				5030	"=S" (dummy_value_S), // 1
				5031	"=D" (dummy_value_D) // 2
				5032
				5033	: "0" (len), // edx // input regs
				5034	"1" (prev_row), // esi
				5035	"2" (row) // edi
				5036
				5037	: "%eax", "%ebx", "%ecx" // clobber list (no input regs!)
				5038
				5039	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
				5040	, "%mm0", "%mm1", "%mm2", "%mm3"
				5041	, "%mm4", "%mm5", "%mm6", "%mm7"
				5042	#endif
				5043	);
				5044
				5045	} // end of png_read_filter_row_mmx_up()
				5046
				5047	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				5048
				5049
				5050
				5051
				5052	/*===========================================================================*/
				5053	/* */
				5054	/* P N G _ R E A D _ F I L T E R _ R O W */
				5055	/* */
				5056	/*===========================================================================*/
				5057
				5058
				5059	/* Optimized png_read_filter_row routines */
				5060
				5061	void /* PRIVATE */
				5062	png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
				5063	row, png_bytep prev_row, int filter)
				5064	{
				5065	#ifdef PNG_DEBUG
				5066	char filnm[10];
				5067	#endif
				5068
				5069	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
				5070	/* GRR: these are superseded by png_ptr->asm_flags: */
				5071	#define UseMMX_sub 1 // GRR: converted 20000730
				5072	#define UseMMX_up 1 // GRR: converted 20000729
				5073	#define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
				5074	#define UseMMX_paeth 1 // GRR: converted 20000828
				5075
				5076	if (_mmx_supported == 2) {
				5077	/* this should have happened in png_init_mmx_flags() already */
				5078	#if !defined(PNG_1_0_X)
				5079	png_warning(png_ptr, "asm_flags may not have been initialized");
				5080	#endif
				5081	png_mmx_support();
				5082	}
				5083	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				5084
				5085	#ifdef PNG_DEBUG
				5086	png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
				5087	switch (filter)
				5088	{
				5089	case 0: sprintf(filnm, "none");
				5090	break;
				5091	case 1: sprintf(filnm, "sub-%s",
				5092	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				5093	#if !defined(PNG_1_0_X)
				5094	(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
				5095	#endif
				5096	#endif
				5097	"x86");
				5098	break;
				5099	case 2: sprintf(filnm, "up-%s",
				5100	#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
				5101	#if !defined(PNG_1_0_X)
				5102	(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
				5103	#endif
				5104	#endif
				5105	"x86");
				5106	break;
				5107	case 3: sprintf(filnm, "avg-%s",
				5108	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				5109	#if !defined(PNG_1_0_X)
				5110	(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
				5111	#endif
				5112	#endif
				5113	"x86");
				5114	break;
				5115	case 4: sprintf(filnm, "Paeth-%s",
				5116	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				5117	#if !defined(PNG_1_0_X)
				5118	(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
				5119	#endif
				5120	#endif
				5121	"x86");
				5122	break;
				5123	default: sprintf(filnm, "unknw");
				5124	break;
				5125	}
				5126	png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
				5127	png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
				5128	png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
				5129	(int)((row_info->pixel_depth + 7) >> 3));
				5130	png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
				5131	#endif /* PNG_DEBUG */
				5132
				5133	switch (filter)
				5134	{
				5135	case PNG_FILTER_VALUE_NONE:
				5136	break;
				5137
				5138	case PNG_FILTER_VALUE_SUB:
				5139	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				5140	#if !defined(PNG_1_0_X)
				5141	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
				5142	(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
				5143	(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
				5144	#else
				5145	if (_mmx_supported)
				5146	#endif
				5147	{
				5148	png_read_filter_row_mmx_sub(row_info, row);
				5149	}
				5150	else
				5151	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				5152	{
				5153	png_uint_32 i;
				5154	png_uint_32 istop = row_info->rowbytes;
				5155	png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
				5156	png_bytep rp = row + bpp;
				5157	png_bytep lp = row;
				5158
				5159	for (i = bpp; i < istop; i++)
				5160	{
				5161	rp = (png_byte)(((int)(rp) + (int)(*lp++)) & 0xff);
				5162	rp++;
				5163	}
				5164	} /* end !UseMMX_sub */
				5165	break;
				5166
				5167	case PNG_FILTER_VALUE_UP:
				5168	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
				5169	#if !defined(PNG_1_0_X)
				5170	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
				5171	(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
				5172	(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
				5173	#else
				5174	if (_mmx_supported)
				5175	#endif
				5176	{
				5177	png_read_filter_row_mmx_up(row_info, row, prev_row);
				5178	}
				5179	else
				5180	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				5181	{
				5182	png_uint_32 i;
				5183	png_uint_32 istop = row_info->rowbytes;
				5184	png_bytep rp = row;
				5185	png_bytep pp = prev_row;
				5186
				5187	for (i = 0; i < istop; ++i)
				5188	{
				5189	rp = (png_byte)(((int)(rp) + (int)(*pp++)) & 0xff);
				5190	rp++;
				5191	}
				5192	} /* end !UseMMX_up */
				5193	break;
				5194
				5195	case PNG_FILTER_VALUE_AVG:
				5196	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				5197	#if !defined(PNG_1_0_X)
				5198	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
				5199	(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
				5200	(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
				5201	#else
				5202	if (_mmx_supported)
				5203	#endif
				5204	{
				5205	png_read_filter_row_mmx_avg(row_info, row, prev_row);
				5206	}
				5207	else
				5208	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				5209	{
				5210	png_uint_32 i;
				5211	png_bytep rp = row;
				5212	png_bytep pp = prev_row;
				5213	png_bytep lp = row;
				5214	png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
				5215	png_uint_32 istop = row_info->rowbytes - bpp;
				5216
				5217	for (i = 0; i < bpp; i++)
				5218	{
				5219	rp = (png_byte)(((int)(rp) +
				5220	((int)(*pp++) >> 1)) & 0xff);
				5221	rp++;
				5222	}
				5223
				5224	for (i = 0; i < istop; i++)
				5225	{
				5226	rp = (png_byte)(((int)(rp) +
				5227	((int)(pp++ + lp++) >> 1)) & 0xff);
				5228	rp++;
				5229	}
				5230	} /* end !UseMMX_avg */
				5231	break;
				5232
				5233	case PNG_FILTER_VALUE_PAETH:
				5234	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
				5235	#if !defined(PNG_1_0_X)
				5236	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
				5237	(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
				5238	(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
				5239	#else
				5240	if (_mmx_supported)
				5241	#endif
				5242	{
				5243	png_read_filter_row_mmx_paeth(row_info, row, prev_row);
				5244	}
				5245	else
				5246	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
				5247	{
				5248	png_uint_32 i;
				5249	png_bytep rp = row;
				5250	png_bytep pp = prev_row;
				5251	png_bytep lp = row;
				5252	png_bytep cp = prev_row;
				5253	png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
				5254	png_uint_32 istop = row_info->rowbytes - bpp;
				5255
				5256	for (i = 0; i < bpp; i++)
				5257	{
				5258	rp = (png_byte)(((int)(rp) + (int)(*pp++)) & 0xff);
				5259	rp++;
				5260	}
				5261
				5262	for (i = 0; i < istop; i++) /* use leftover rp,pp */
				5263	{
				5264	int a, b, c, pa, pb, pc, p;
				5265
				5266	a = *lp++;
				5267	b = *pp++;
				5268	c = *cp++;
				5269
				5270	p = b - c;
				5271	pc = a - c;
				5272
				5273	#ifdef PNG_USE_ABS
				5274	pa = abs(p);
				5275	pb = abs(pc);
				5276	pc = abs(p + pc);
				5277	#else
				5278	pa = p < 0 ? -p : p;
				5279	pb = pc < 0 ? -pc : pc;
				5280	pc = (p + pc) < 0 ? -(p + pc) : p + pc;
				5281	#endif
				5282
				5283	/*
				5284	if (pa <= pb && pa <= pc)
				5285	p = a;
				5286	else if (pb <= pc)
				5287	p = b;
				5288	else
				5289	p = c;
				5290	*/
				5291
				5292	p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
				5293
				5294	rp = (png_byte)(((int)(rp) + p) & 0xff);
				5295	rp++;
				5296	}
				5297	} /* end !UseMMX_paeth */
				5298	break;
				5299
				5300	default:
				5301	png_warning(png_ptr, "Ignoring bad row-filter type");
				5302	*row=0;
				5303	break;
				5304	}
				5305	}
				5306
				5307	#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
				5308
				5309
				5310	/*===========================================================================*/
				5311	/* */
				5312	/* P N G _ M M X _ S U P P O R T */
				5313	/* */
				5314	/*===========================================================================*/
				5315
				5316	/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
				5317	* (2) all instructions compile with gcc 2.7.2.3 and later
				5318	* (3) the function is moved down here to prevent gcc from
				5319	* inlining it in multiple places and then barfing be-
				5320	* cause the ".NOT_SUPPORTED" label is multiply defined
				5321	* [is there a way to signal that a single function should
				5322	* not be inlined? is there a way to modify the label for
				5323	* each inlined instance, e.g., by appending _1, _2, etc.?
				5324	* maybe if don't use leading "." in label name? (nope...sigh)]
				5325	*/
				5326
				5327	int PNGAPI
				5328	png_mmx_support(void)
				5329	{
				5330	#if defined(PNG_MMX_CODE_SUPPORTED)
				5331	__asm__ __volatile__ (
				5332	"pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
				5333	"pushl %%ecx \n\t" // so does ecx...
				5334	"pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
				5335	// ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
				5336	// "pushf \n\t" // 16-bit pushf
				5337	"pushfl \n\t" // save Eflag to stack
				5338	"popl %%eax \n\t" // get Eflag from stack into eax
				5339	"movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
				5340	"xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
				5341	"pushl %%eax \n\t" // save modified Eflag back to stack
				5342	// ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
				5343	// "popf \n\t" // 16-bit popf
				5344	"popfl \n\t" // restore modified value to Eflag reg
				5345	"pushfl \n\t" // save Eflag to stack
				5346	"popl %%eax \n\t" // get Eflag from stack
				5347	"pushl %%ecx \n\t" // save original Eflag to stack
				5348	"popfl \n\t" // restore original Eflag
				5349	"xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
				5350	"jz 0f \n\t" // if same, CPUID instr. is not supported
				5351
				5352	"xorl %%eax, %%eax \n\t" // set eax to zero
				5353	// ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
				5354	"cpuid \n\t" // get the CPU identification info
				5355	"cmpl $1, %%eax \n\t" // make sure eax return non-zero value
				5356	"jl 0f \n\t" // if eax is zero, MMX is not supported
				5357
				5358	"xorl %%eax, %%eax \n\t" // set eax to zero and...
				5359	"incl %%eax \n\t" // ...increment eax to 1. This pair is
				5360	// faster than the instruction "mov eax, 1"
				5361	"cpuid \n\t" // get the CPU identification info again
				5362	"andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
				5363	"cmpl $0, %%edx \n\t" // 0 = MMX not supported
				5364	"jz 0f \n\t" // non-zero = yes, MMX IS supported
				5365
				5366	"movl $1, %%eax \n\t" // set return value to 1
				5367	"jmp 1f \n\t" // DONE: have MMX support
				5368
				5369	"0: \n\t" // .NOT_SUPPORTED: target label for jump instructions
				5370	"movl $0, %%eax \n\t" // set return value to 0
				5371	"1: \n\t" // .RETURN: target label for jump instructions
				5372	"movl %%eax, _mmx_supported \n\t" // save in global static variable, too
				5373	"popl %%edx \n\t" // restore edx
				5374	"popl %%ecx \n\t" // restore ecx
				5375	"popl %%ebx \n\t" // restore ebx
				5376
				5377	// "ret \n\t" // DONE: no MMX support
				5378	// (fall through to standard C "ret")
				5379
				5380	: // output list (none)
				5381
				5382	: // any variables used on input (none)
				5383
				5384	: "%eax" // clobber list
				5385	// , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
				5386	// , "memory" // if write to a variable gcc thought was in a reg
				5387	// , "cc" // "condition codes" (flag bits)
				5388	);
				5389	#else
				5390	_mmx_supported = 0;
				5391	#endif /* PNG_MMX_CODE_SUPPORTED */
				5392
				5393	return _mmx_supported;
				5394	}
				5395
				5396
				5397	#endif /* PNG_USE_PNGGCCRD */