Imported from libpng-1.0.4c.tar
diff --git a/ANNOUNCE b/ANNOUNCE
index d988691..dcdd135 100644
--- a/ANNOUNCE
+++ b/ANNOUNCE
@@ -1,10 +1,12 @@
 
-Libpng 1.0.4 - September 19, 1999
+Libpng 1.0.4c - October 1, 1999
 
-This is a public release of libpng, intended for use in production codes.
+This is not intended to be a public release.  It will be replaced
+within a few weeks by a public version or by another test version.
 
 Changes since the last public release (1.0.3):
 
+version 1.0.3a [August 12, 1999]
   Added check for PNG_READ_INTERLACE_SUPPORTED in pngread.c; issue a warning
      if an attempt is made to read an interlaced image when it's not supported.
   Added check if png_ptr->trans is defined before free'ing it in pngread.c
@@ -32,25 +34,50 @@
     consistent with PNG-1.2, and allow variance of 500 before complaining.
   Added assembler code contributed by Intel in file pngvcrd.c and modified
     makefile.w32 to use it (Nirav Chhatrapati, INTEL Corporation, Gilles Vollant)
-  Define PNG_USE_PNGVCRD in makefile.w32, to get MMX assembler code.
   Changed "ln -s -f" to "ln -f -s" in the makefiles to make Solaris happy.
+  Added some aliases for png_set_expand() in pngrtran.c, namely
+    png_set_expand_PLTE(), png_set_expand_depth(), and png_set_expand_tRNS()
+    (Greg Roelofs, in "PNG: The Definitive Guide").
   Added makefile.beo for BEOS on X86, contributed by Sander Stok.
+version 1.0.3b [August 26, 1999]
   Replaced 2147483647L several places with PNG_MAX_UINT macro, defined in png.h
   Changed leading blanks to tabs in all makefiles.
+  Define PNG_USE_PNGVCRD in makefile.w32, to get MMX assembler code.
   Made alternate versions of  png_set_expand() in pngrtran.c, namely
     png_set_gray_1_2_4_to_8, png_set_palette_to_rgb, and png_set_tRNS_to_alpha
-    (Greg Roelofs, in "PNG: The Definitive Guide").
+    (Greg Roelofs, in "PNG: The Definitive Guide").  Deleted the 1.0.3a aliases.
   Relocated start of 'extern "C"' block in png.h so it doesn't include pngconf.h
   Revised calculation of num_blocks in pngmem.c to avoid a potentially
     negative shift distance, whose results are undefined in the C language.
   Added a check in pngset.c to prevent writing multiple tIME chunks.
   Added a check in pngwrite.c to detect invalid small window_bits sizes.
+version 1.0.3d [September 4, 1999]
+  Fixed type casting of igamma in pngrutil.c
+  Added new png_expand functions to scripts/pngdef.pas and pngos2.def
   Added a demo read_user_transform_fn that examines the row filters in pngtest.c
+version 1.0.4 [September 24, 1999]
   Define PNG_ALWAYS_EXTERN in pngconf.h if __STDC__ is defined
+  Delete #define PNG_INTERNAL and include "png.h" from pngasmrd.h
   Made several minor corrections to pngtest.c
   Changed "hptr += 16L" to "hptr = hptr + 16L" in pngmem.c for Turbo 3.0
   Renamed the makefiles with longer but more user friendly extensions.
   Copied the PNG copyright and license to a separate LICENSE file.
+  Revised documentation, png.h, and example.c to remove reference to
+    "viewing_gamma" which no longer appears in the PNG specification.
+  Revised pngvcrd.c to use MMX code for interlacing only on the final pass.
+  Updated pngvcrd.c to use the faster C filter algorithms from libpng-1.0.1a
+  Split makefile.win32vc into two versions, makefile.vcawin32 (uses MMX
+    assembler code) and makefile.vcwin32 (doesn't).
+  Added a CPU timing report to pngtest.c (enabled by defining PNGTEST_TIMING)
+version 1.0.4a [September 25, 1999]
+  Increase max_pixel_depth in pngrutil.c if a user transform needs it.
+  Changed several division operations to right-shifts in pngvcrd.c
+version 1.0.4b [September 30, 1999]
+  Added parentheses in line 3732 of pngvcrd.c
+  Added a comment in makefile.linux warning about buggy -O3 in pgcc 2.95.1
+version 1.0.4c [October 1, 1999]
+  Added a "png_check_version" function in png.c and pngtest.c that will generate
+    a helpful compiler error if an old png.h is found in the search path.
 
 Send comments/corrections/commendations to
 png-implement@ccrc.wustl.edu or to randeg@alum.rpi.edu
diff --git a/CHANGES b/CHANGES
index 7135912..ee7c17f 100644
--- a/CHANGES
+++ b/CHANGES
@@ -436,9 +436,25 @@
   Fixed type casting of igamma in pngrutil.c
   Added new png_expand functions to scripts/pngdef.pas and pngos2.def
   Added a demo read_user_transform_fn that examines the row filters in pngtest.c
-version 1.0.4 [September 19, 1999]
+version 1.0.4 [September 24, 1999]
   Define PNG_ALWAYS_EXTERN in pngconf.h if __STDC__ is defined
   Delete #define PNG_INTERNAL and include "png.h" from pngasmrd.h
   Made several minor corrections to pngtest.c
   Renamed the makefiles with longer but more user friendly extensions.
   Copied the PNG copyright and license to a separate LICENSE file.
+  Revised documentation, png.h, and example.c to remove reference to
+    "viewing_gamma" which no longer appears in the PNG specification.
+  Revised pngvcrd.c to use MMX code for interlacing only on the final pass.
+  Updated pngvcrd.c to use the faster C filter algorithms from libpng-1.0.1a
+  Split makefile.win32vc into two versions, makefile.vcawin32 (uses MMX
+    assembler code) and makefile.vcwin32 (doesn't).
+  Added a CPU timing report to pngtest.c (enabled by defining PNGTEST_TIMING)
+version 1.0.4a [September 25, 1999]
+  Increase max_pixel_depth in pngrutil.c if a user transform needs it.
+  Changed several division operations to right-shifts in pngvcrd.c
+version 1.0.4b [September 30, 1999]
+  Added parentheses in line 3732 of pngvcrd.c
+  Added a comment in makefile.linux warning about buggy -O3 in pgcc 2.95.1
+version 1.0.4c [October 1, 1999]
+  Added a "png_check_version" function in png.c and pngtest.c that will generate
+    a helpful compiler error if an old png.h is found in the search path.
diff --git a/INSTALL b/INSTALL
index 1cd8b63..475a8d4 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,5 +1,5 @@
 
-Installing libpng version 1.0.4 - September 19, 1999
+Installing libpng version 1.0.4c - October 1, 1999
 
 Before installing libpng, you must first install zlib.  zlib
 can usually be found wherever you got libpng.  zlib can be
@@ -10,7 +10,7 @@
 version of zlib that's installed.
 
 You can rename the directories that you downloaded (they
-might be called "libpng-1.0.4" or "lpng103" and "zlib-1.1.3"
+might be called "libpng-1.0.4c" or "lpng103" and "zlib-1.1.3"
 or "zlib113") so that you have directories called "zlib" and "libpng".
 
 Your directory structure should look like this:
@@ -47,8 +47,8 @@
  makefile.hpux    =>  HPUX (10.20 and 11.00) makefile
  makefile.sgi     =>  Silicon Graphics IRIX makefile
  makefile.sunos   =>  Sun makefile
- makefile.solaris =>  Solaris 2.X makefile (gcc, creates libpng.so.2.1.0.4)
- makefile.linux   =>  Linux/ELF makefile (gcc, creates libpng.so.2.1.0.4)
+ makefile.solaris =>  Solaris 2.X makefile (gcc, creates libpng.so.2.1.0.4c)
+ makefile.linux   =>  Linux/ELF makefile (gcc, creates libpng.so.2.1.0.4c)
  makefile.sco     =>  For SCO OSr5  ELF and Unixware 7 with Native cc
  makefile.mips    =>  MIPS makefile
  makefile.acorn   =>  Acorn makefile
@@ -61,7 +61,10 @@
  build.bat        =>  MS-DOS batch file for Borland compiler
  makefile.dj2     =>  DJGPP 2 makefile
  makefile.msc     =>  Microsoft C makefile
- makefile.win32vc =>  makefile for Microsoft Visual C++ 4.0 and later
+ makefile.vcawin32 => makefile for Microsoft Visual C++ 5.0 and later (uses
+                      assembler code)
+ makefile.vcwin32 =>  makefile for Microsoft Visual C++ 4.0 and later (does not
+                      use assembler code)
  makefile.turboc3 =>  Turbo C 3.0 makefile
  makefile.os2     =>  OS/2 Makefile (gcc and emx, requires pngos2.def)
  pngos2.def       =>  OS/2 module definition file used by makefile.os2
diff --git a/KNOWNBUG b/KNOWNBUG
index 7c640a8..1c5629d 100644
--- a/KNOWNBUG
+++ b/KNOWNBUG
@@ -14,12 +14,12 @@
    Question whether i-- or --i is better.
 
    STATUS: Under investigation, postponed until after
-   libpng-1.0.4.  About 160 loops will be turned around
+   libpng-1.0.5.  About 160 loops will be turned around
    in libpng-1.0.Nn, for testing.
 
 2. July 4, 1998 -- ENHANCEMENT -- Glenn R-P
 
-   libpng-1.0.4 and earlier transform colors to gamma=1.0 space for
+   libpng-1.0.5 and earlier transform colors to gamma=1.0 space for
    merging with background, and then back to the image's gamma.  The
    bit_depth of the intermediate (gamma=1.0) representation is probably
    not sufficient.  In the typical gamma=1/2.2 situation, the linear
@@ -34,7 +34,7 @@
    It should be possible to use libpng without floating-point aritmetic.
 
    STATUS: Under investigation, implementation postponed until after
-   libpng-1.0.4.  The application interface will change because replacements
+   libpng-1.0.5.  The application interface will change because replacements
    for the png_set_gAMA(), png_set_cHRM(), and corresponding png_get_()
    functions will be needed.
 
diff --git a/LICENSE b/LICENSE
index 92dd0ea..22fc3f2 100644
--- a/LICENSE
+++ b/LICENSE
@@ -5,7 +5,25 @@
 Copyright (c) 1996, 1997 Andreas Dilger
 (libpng versions 0.90, December 1996, through 0.96, May 1997)
 Copyright (c) 1998, 1999 Glenn Randers-Pehrson
-(libpng versions 0.97, January 1998, through 1.0.4, September 19, 1999)
+(libpng versions 0.97, January 1998, through 1.0.4c, October 1, 1999)
+
+For the purposes of this copyright and license, "Contributing Authors"
+is defined as the following set of individuals:
+
+   John Bowler
+   Kevin Bracey
+   Sam Bushell
+   Andreas Dilger
+   Magnus Holmgren
+   Tom Lane
+   Dave Martindale
+   Glenn Randers-Pehrson
+   Greg Roelofs
+   Guy Eric Schalnat
+   Paul Schmidt
+   Tom Tanner
+   Willem van Schaik
+   Tim Wegner
 
 The PNG Reference Library is supplied "AS IS".  The Contributing Authors
 and Group 42, Inc. disclaim all warranties, expressed or implied,
@@ -37,5 +55,5 @@
 appreciated.
 
 Glenn Randers-Pehrson
-randeg at alum.rpi.edu
-September 19, 1999
+randeg@alum.rpi.edu
+October 1, 1999
diff --git a/README b/README
index ae4f728..e19f306 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README for libpng 1.0.4 - September 19, 1999 (shared library 2.1)
+README for libpng 1.0.4c - October 1, 1999 (shared library 2.1)
 See the note about version numbers near the top of png.h
 
 See INSTALL for instructions on how to install libpng.
@@ -163,9 +163,9 @@
        makefile.sgi     =>  Silicon Graphics IRIX makefile
        makefile.sunos   =>  Sun makefile
        makefile.solaris =>  Solaris 2.X makefile
-                            (gcc, creates libpng.so.2.1.0.4)
+                            (gcc, creates libpng.so.2.1.0.4c)
        makefile.linux   =>  Linux/ELF makefile
-                            (gcc, creates libpng.so.2.1.0.4)
+                            (gcc, creates libpng.so.2.1.0.4c)
        makefile.sco     =>  For SCO OSr5  ELF and Unixware 7 with Native cc
        makefile.mips    =>  MIPS makefile
        makefile.acorn   =>  Acorn makefile
@@ -179,7 +179,10 @@
        build.bat        =>  MS-DOS batch file for Borland compiler
        makefile.dj2     =>  DJGPP 2 makefile
        makefile.msc     =>  Microsoft C makefile
-       makefile.win32vc =>  makefile for Microsoft Visual C++ 4.0 and later
+       makefile.vcawin32 => makefile for Microsoft Visual C++ 5.0 and
+                            later (uses assembler code)
+       makefile.vcwin32 =>  makefile for Microsoft Visual C++ 4.0 and
+                            later (does not use assembler code)
        makefile.turboc3 =>  Turbo C 3.0 makefile
        makefile.os2     =>  OS/2 Makefile (gcc and emx, requires pngos2.def)
        pngos2.def       =>  OS/2 module definition file used by makefile.os2
diff --git a/Y2KINFO b/Y2KINFO
index bdd2244..103579a 100644
--- a/Y2KINFO
+++ b/Y2KINFO
@@ -1,13 +1,13 @@
    Y2K compliance in libpng:
    =========================
       
-      September 19, 1999
+      October 1, 1999
       
       Since the PNG Development group is an ad-hoc body, we can't make
       an official declaration.
       
       This is your unofficial assurance that libpng from version 0.71 and
-      upward through 1.0.4 are Y2K compliant.  It is my belief that earlier
+      upward through 1.0.4c are Y2K compliant.  It is my belief that earlier
       versions were also Y2K compliant.
       
       Libpng only has three year fields.  One is a 2-byte unsigned integer
diff --git a/example.c b/example.c
index a83ea48..52afbdf 100644
--- a/example.c
+++ b/example.c
@@ -197,7 +197,8 @@
 
    /* Some suggestions as to how to get a screen gamma value */
 
-   /* Note that screen gamma is (display_gamma/viewing_gamma) */
+   /* Note that screen gamma is the display_exponent, which includes
+    * the CRT_exponent and any correction for viewing conditions */
    if (/* We have a user-defined screen gamma value */)
    {
       screen_gamma = user-defined screen_gamma;
diff --git a/libpng.3 b/libpng.3
index 48f3eb5..8fbfa99 100644
--- a/libpng.3
+++ b/libpng.3
@@ -1,6 +1,6 @@
-.TH LIBPNG 3 "September 19, 1999"
+.TH LIBPNG 3 "October 1, 1999"
 .SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4 - September 19, 1999
+libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4c - October 1, 1999
 .SH SYNOPSIS
 \fI\fB
 
@@ -617,7 +617,7 @@
 .SH LIBPNG.TXT
 libpng.txt - A description on how to use and modify libpng
 
- libpng version 1.0.4 - September 19, 1999
+ libpng version 1.0.4c - October 1, 1999
  Updated and distributed by Glenn Randers-Pehrson
  <randeg@alum.rpi.edu>
  Copyright (c) 1998, 1999 Glenn Randers-Pehrson
@@ -1314,17 +1314,15 @@
 To properly display PNG images on any kind of system, the application needs
 to know what the display gamma is.  Ideally, the user will know this, and
 the application will allow them to set it.  One method of allowing the user
-to set the display gamma separately for each system is to check for the
-DISPLAY_GAMMA and VIEWING_GAMMA environment variables or for a SCREEN_GAMMA
-environment variable, which will hopefully be correctly set.
+to set the display gamma separately for each system is to check for a
+SCREEN_GAMMA or DISPLAY_GAMMA environment variable, which will hopefully be
+correctly set.
 
-Note that display_gamma is the gamma of your display, while screen_gamma is
-the overall gamma correction required to produce pleasing results,
-which depends on the lighting conditions in the surrounding environment.
-Screen_gamma is display_gamma/viewing_gamma, where viewing_gamma is
-the amount of additional gamma correction needed to compensate for
-a (viewing_gamma=1.25) environment.  In a dim or brightly lit room, no
-compensation other than the display_gamma is needed (viewing_gamma=1.0).
+Note that display_gamma is the overall gamma correction required to produce
+pleasing results, which depends on the lighting conditions in the surrounding
+environment.  In a dim or brightly lit room, no compensation other than
+the physical gamma exponent of the monitor is needed, while in a dark room
+a slightly smaller exponent is better.
 
    double gamma, screen_gamma;
 
@@ -2677,13 +2675,13 @@
 
 .SH VII. Y2K Compliance in libpng
 
-January 13, 1999
+October 1, 1999
 
 Since the PNG Development group is an ad-hoc body, we can't make
 an official declaration.
 
 This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.0.4 are Y2K compliant.  It is my belief that earlier
+upward through 1.0.4c are Y2K compliant.  It is my belief that earlier
 versions were also Y2K compliant.
 
 Libpng only has three year fields.  One is a 2-byte unsigned integer that
@@ -2802,12 +2800,6 @@
 .SH AUTHORS
 This man page: Glenn Randers-Pehrson
 <randeg@alum.rpi.edu>
-  
-Contributing Authors: John Bowler, Kevin Bracey, Sam Bushell, Andreas Dilger,
-Magnus Holmgren, Tom Lane, Dave Martindale, Glenn Randers-Pehrson,
-Greg Roelofs, Guy Eric Schalnat, Paul Schmidt, Tom Tanner, Willem van
-Schaik, Tim Wegner.
-<png-implement@ccrc.wustl.edu>
 
 The contributing authors would like to thank all those who helped
 with testing, bug fixes, and patience.  This wouldn't have been
@@ -2815,7 +2807,7 @@
 
 Thanks to Frank J. T. Wojcik for helping with the documentation.
   
-Libpng version 1.0.4 - September 19, 1999:
+Libpng version 1.0.4c - October 1, 1999:
 Initially created in 1995 by Guy Eric Schalnat, then of Group 42, Inc.
 Currently maintained by Glenn Randers-Pehrson (randeg@alum.rpi.edu).
 
@@ -2830,7 +2822,25 @@
 Copyright (c) 1996, 1997 Andreas Dilger
 (libpng versions 0.90, December 1996, through 0.96, May 1997)
 Copyright (c) 1998, 1999 Glenn Randers-Pehrson
-(libpng versions 0.97, January 1998, through 1.0.4, September 19, 1999)
+(libpng versions 0.97, January 1998, through 1.0.4c, October 1, 1999)
+
+For the purposes of this copyright and license, "Contributing Authors"
+is defined as the following set of individuals:
+
+   John Bowler
+   Kevin Bracey
+   Sam Bushell
+   Andreas Dilger
+   Magnus Holmgren
+   Tom Lane
+   Dave Martindale
+   Glenn Randers-Pehrson
+   Greg Roelofs
+   Guy Eric Schalnat
+   Paul Schmidt
+   Tom Tanner
+   Willem van Schaik
+   Tim Wegner
 
 The PNG Reference Library (libpng) is supplied "AS IS".  The Contributing
 Authors and Group 42, Inc. disclaim all warranties, expressed or implied,
@@ -2869,5 +2879,8 @@
 Also, the PNG logo (in PNG format, of course) is supplied in the
 file "pngnow.png".
 
+Libpng is OSI Certified Open Source Software.  OSI Certified is a
+certification mark of the Open Source Initiative.
+
 .\" end of man page
 
diff --git a/libpng.txt b/libpng.txt
index 7fd60af..8023e79 100644
--- a/libpng.txt
+++ b/libpng.txt
@@ -1,6 +1,6 @@
 libpng.txt - A description on how to use and modify libpng
 
- libpng version 1.0.4 - September 19, 1999
+ libpng version 1.0.4c - October 1, 1999
  Updated and distributed by Glenn Randers-Pehrson
  <randeg@alum.rpi.edu>
  Copyright (c) 1998, 1999 Glenn Randers-Pehrson
@@ -697,17 +697,15 @@
 To properly display PNG images on any kind of system, the application needs
 to know what the display gamma is.  Ideally, the user will know this, and
 the application will allow them to set it.  One method of allowing the user
-to set the display gamma separately for each system is to check for the
-DISPLAY_GAMMA and VIEWING_GAMMA environment variables or for a SCREEN_GAMMA
-environment variable, which will hopefully be correctly set.
+to set the display gamma separately for each system is to check for a
+SCREEN_GAMMA or DISPLAY_GAMMA environment variable, which will hopefully be
+correctly set.
 
-Note that display_gamma is the gamma of your display, while screen_gamma is
-the overall gamma correction required to produce pleasing results,
-which depends on the lighting conditions in the surrounding environment.
-Screen_gamma is display_gamma/viewing_gamma, where viewing_gamma is
-the amount of additional gamma correction needed to compensate for
-a (viewing_gamma=1.25) environment.  In a dim or brightly lit room, no
-compensation other than the display_gamma is needed (viewing_gamma=1.0).
+Note that display_gamma is the overall gamma correction required to produce
+pleasing results, which depends on the lighting conditions in the surrounding
+environment.  In a dim or brightly lit room, no compensation other than
+the physical gamma exponent of the monitor is needed, while in a dark room
+a slightly smaller exponent is better.
 
    double gamma, screen_gamma;
 
@@ -2060,13 +2058,13 @@
 
 VII. Y2K Compliance in libpng
 
-January 13, 1999
+October 1, 1999
 
 Since the PNG Development group is an ad-hoc body, we can't make
 an official declaration.
 
 This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.0.4 are Y2K compliant.  It is my belief that earlier
+upward through 1.0.4c are Y2K compliant.  It is my belief that earlier
 versions were also Y2K compliant.
 
 Libpng only has three year fields.  One is a 2-byte unsigned integer that
diff --git a/libpngpf.3 b/libpngpf.3
index c808ae8..fb0803f 100644
--- a/libpngpf.3
+++ b/libpngpf.3
@@ -1,6 +1,6 @@
-.TH LIBPNGPF 3 September 19, 1999
+.TH LIBPNGPF 3 "October 1, 1999"
 .SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4 - September 19, 1999
+libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4c - October 1, 1999
 (private functions)
 .SH SYNOPSIS
 \fB#include <png.h>\fP
diff --git a/png.5 b/png.5
index 0bfb130..081adb5 100644
--- a/png.5
+++ b/png.5
@@ -1,4 +1,4 @@
-.TH PNG 5 "September 19, 1999"
+.TH PNG 5 "October 1, 1999"
 .SH NAME
 png \- Portable Network Graphics (PNG) format
 .SH DESCRIPTION
diff --git a/png.c b/png.c
index dfca050..1b0141b 100644
--- a/png.c
+++ b/png.c
@@ -1,7 +1,7 @@
 
 /* png.c - location for general purpose libpng functions
  *
- * libpng version 1.0.4 - September 19, 1999
+ * libpng version 1.0.4c - October 1, 1999
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
  * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
@@ -16,7 +16,7 @@
  * string defined in png.h.
  */
 
-char png_libpng_ver[12] = "1.0.4";
+char png_libpng_ver[12] = "1.0.4c";
 
 /* Place to hold the signature string for a PNG file. */
 png_byte FARDATA png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
@@ -73,12 +73,12 @@
 /* Mask to determine which pixels to overwrite while displaying */
 int FARDATA png_pass_dsp_mask[] = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
 
-
 /* Tells libpng that we have already handled the first "num_bytes" bytes
  * of the PNG file signature.  If the PNG data is embedded into another
  * stream we can set num_bytes = 8 so that libpng will not attempt to read
  * or write any of the magic bytes before it starts on the IHDR.
  */
+
 void
 png_set_sig_bytes(png_structp png_ptr, int num_bytes)
 {
@@ -352,8 +352,17 @@
 {
    if(png_ptr == NULL)
      /* silence compiler warning about unused png_ptr */ ;
-   return("\n libpng version 1.0.4 - September 19, 1999\n\
+   return("\n libpng version 1.0.4c - October 1, 1999\n\
    Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.\n\
    Copyright (c) 1996, 1997 Andreas Dilger\n\
    Copyright (c) 1998, 1999 Glenn Randers-Pehrson\n");
 }
+
+/* Generate a compiler error if there is an old png.h in the search path. */
+void
+png_check_version
+   (version_1_0_4c png_h_is_not_version_1_0_4c)
+{
+   if(png_h_is_not_version_1_0_4c == NULL)
+     /* silence compiler warning about unused parameter */ ;
+}
diff --git a/png.h b/png.h
index d370703..67a29e5 100644
--- a/png.h
+++ b/png.h
@@ -1,7 +1,7 @@
 
 /* png.h - header file for PNG reference library
  *
- * libpng version 1.0.4 - September 19, 1999
+ * libpng version 1.0.4c - October 1, 1999
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
  * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
@@ -9,19 +9,19 @@
  * Authors and maintainers:
  *  libpng versions 0.71, May 1995, through 0.89c, May 1996: Guy Schalnat
  *  libpng versions 0.90, December 1996, through 0.96, May 1997: Andreas Dilger
- *  libpng versions 0.97, January 1998, through 1.0.4 - September 19, 1999: Glenn R-P
+ *  libpng versions 0.97, January 1998, through 1.0.4c - October 1, 1999: Glenn
  *  See also "Contributing Authors", below.
  *
  * Y2K compliance in libpng:
  * =========================
  *    
- *    January 13, 1999
+ *    October 1, 1999
  *    
  *    Since the PNG Development group is an ad-hoc body, we can't make
  *    an official declaration.
  *    
  *    This is your unofficial assurance that libpng from version 0.71 and
- *    upward through 1.0.4 are Y2K compliant.  It is my belief that earlier
+ *    upward through 1.0.4c are Y2K compliant.  It is my belief that earlier
  *    versions were also Y2K compliant.
  *    
  *    Libpng only has three year fields.  One is a 2-byte unsigned integer
@@ -86,8 +86,8 @@
  *    0.98                     0.98        98  2.0.98
  *    0.99                     0.99        98  2.0.99
  *    0.99a-m                  0.99        99  2.0.99
- *    1.00                     1.00       100  2.1.0 [int should be 10000]
- *    1.0.0                    1.0.0      100  2.1.0 [int should be 10000]
+ *    1.00                     1.00       100  2.1.0 [100 should be 10000]
+ *    1.0.0                    1.0.0      100  2.1.0 [100 should be 10000]
  *    1.0.1                    1.0.1    10001  2.1.0
  *    1.0.1a-e                 1.0.1a-e 10002  2.1.0.1a-e
  *    1.0.2                    1.0.2    10002  2.1.0.2
@@ -95,6 +95,8 @@
  *    1.0.3                    1.0.3    10003  2.1.0.3
  *    1.0.3a-d                 1.0.3a-d 10004  2.1.0.3a-d
  *    1.0.4                    1.0.4    10004  2.1.0.4
+ *    1.0.4a-c                 1.0.4a-c 10005  2.1.0.4a-c
+ *    1.0.5                    1.0.5    10005  2.1.0.5
  *
  *    Henceforth the source version will match the shared-library minor
  *    and patch numbers; the shared-library major version number will be
@@ -108,7 +110,18 @@
  * is available as RFC 2083 <ftp://ftp.uu.net/graphics/png/documents/>
  * and as a W3C Recommendation <http://www.w3.org/TR/REC.png.html>
  *
- * Contributing Authors:
+ * COPYRIGHT NOTICE:
+ *
+ * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
+ * (libpng versions 0.5, May 1995, through 0.89c, May 1996)
+ * Copyright (c) 1996, 1997 Andreas Dilger
+ * (libpng versions 0.90, December 1996, through 0.96, May 1997)
+ * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
+ * (libpng versions 0.97, January 1998, through 1.0.4c, October 1, 1999)
+ *
+ * For the purposes of this copyright and license, "Contributing Authors"
+ * is defined as the following set of individuals:
+ *
  *    John Bowler
  *    Kevin Bracey
  *    Sam Bushell
@@ -124,21 +137,6 @@
  *    Willem van Schaik
  *    Tim Wegner
  *
- * The contributing authors would like to thank all those who helped
- * with testing, bug fixes, and patience.  This wouldn't have been
- * possible without all of you.
- *
- * Thanks to Frank J. T. Wojcik for helping with the documentation.
- *
- * COPYRIGHT NOTICE:
- *
- * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
- * (libpng versions 0.5, May 1995, through 0.89c, May 1996)
- * Copyright (c) 1996, 1997 Andreas Dilger
- * (libpng versions 0.90, December 1996, through 0.96, May 1997)
- * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
- * (libpng versions 0.97, January 1998, through 1.0.4, September 19, 1999)
- *
  * The PNG Reference Library is supplied "AS IS".  The Contributing Authors
  * and Group 42, Inc. disclaim all warranties, expressed or implied,
  * including, without limitation, the warranties of merchantability and of
@@ -169,6 +167,20 @@
  * appreciated.
  */
 
+/*
+ *
+ * Libpng is OSI Certified Open Source Software.  OSI Certified is a
+ * certification mark of the Open Source Initiative.
+ */
+
+/*
+ * The contributing authors would like to thank all those who helped
+ * with testing, bug fixes, and patience.  This wouldn't have been
+ * possible without all of you.
+ *
+ * Thanks to Frank J. T. Wojcik for helping with the documentation.
+ */
+
 #ifndef _PNG_H
 #define _PNG_H
 
@@ -196,14 +208,16 @@
  */
 
 /* Version information for png.h - this should match the version in png.c */
-#define PNG_LIBPNG_VER_STRING "1.0.4"
+#define PNG_LIBPNG_VER_STRING "1.0.4c"
 
 /* Careful here.  At one time, Guy wanted to use 082, but that would be octal.
  * We must not include leading zeros.
  * Versions 0.7 through 1.0.0 were in the range 0 to 100 here (only
  * version 1.0.0 was mis-numbered 100 instead of 10000).  From
  * version 1.0.1 it's    xxyyzz, where x=major, y=minor, z=bugfix */
-#define PNG_LIBPNG_VER    10004  /* 1.0.4 */
+#define PNG_LIBPNG_VER    10005  /* 1.0.5 */
+
+/* Note to maintainer: update this number in scripts/pngdef.pas as well */
 
 /* variables declared in png.c - only it needs to define PNG_NO_EXTERN */
 #if !defined(PNG_NO_EXTERN) || defined(PNG_ALWAYS_EXTERN)
@@ -701,7 +715,7 @@
 #if defined(PNG_READ_GAMMA_SUPPORTED) || defined(PNG_READ_BACKGROUND_SUPPORTED)
    int gamma_shift;      /* number of "insignificant" bits 16-bit gamma */
    float gamma;          /* file gamma value */
-   float screen_gamma;   /* screen gamma value (display_gamma/viewing_gamma */
+   float screen_gamma;   /* screen gamma value (display_exponent) */
 #endif /* PNG_READ_GAMMA_SUPPORTED */
 #if defined(PNG_READ_GAMMA_SUPPORTED) || defined(PNG_READ_BACKGROUND_SUPPORTED)
    png_bytep gamma_table;     /* gamma table for 8-bit depth files */
@@ -791,6 +805,11 @@
 #endif
 };
 
+/* This prevents a compiler error in png_check_version() in png.c if png.c
+ * and png.h are both at version 1.0.4c
+ */
+typedef png_structp version_1_0_4c;
+
 typedef png_struct FAR * FAR * png_structpp;
 
 /* Here are the function definitions most commonly used.  This is not
@@ -993,7 +1012,7 @@
 #endif /* PNG_READ_DITHER_SUPPORTED */
 
 #if defined(PNG_READ_GAMMA_SUPPORTED)
-/* Handle gamma correction. Screen_gamma=(display_gamma/viewing_gamma) */
+/* Handle gamma correction. Screen_gamma=(display_exponent) */
 extern PNG_EXPORT(void,png_set_gamma) PNGARG((png_structp png_ptr,
    double screen_gamma, double default_file_gamma));
 #endif /* PNG_READ_GAMMA_SUPPORTED */
@@ -1610,7 +1629,7 @@
 {
    if(png_ptr == NULL)
      /* silence compiler warning about unused png_ptr */ ;
-   return("\n libpng version 1.0.4 - September 19, 1999 (header)\n");
+   return("\n libpng version 1.0.4c - October 1, 1999 (header)\n");
 }
 #endif
 
diff --git a/pngasmrd.h b/pngasmrd.h
index ae9853c..e6c9c02 100644
--- a/pngasmrd.h
+++ b/pngasmrd.h
@@ -1,6 +1,6 @@
 /* pngasmrd.h - assembler version of utilities to read a PNG file
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1999 Glenn Randers-Pehrson
  *
@@ -21,7 +21,7 @@
 /* Set this in the makefile for gcc on Pentium, not in pngconf.h */
 #ifdef PNG_USE_PNGGCCRD
 /* Platform must be Pentium.  Makefile must assemble and load pnggccrd.c
- * (not available in libpng 1.0.4).
+ * (not available in libpng 1.0.4c).
  * MMX will be detected at run time and used if present.
  */
 #define PNG_HAVE_ASSEMBLER_COMBINE_ROW
diff --git a/pngconf.h b/pngconf.h
index 3f546bc..41316d9 100644
--- a/pngconf.h
+++ b/pngconf.h
@@ -1,7 +1,7 @@
 
 /* pngconf.h - machine configurable file for libpng
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngerror.c b/pngerror.c
index 2d2cede..63a2b48 100644
--- a/pngerror.c
+++ b/pngerror.c
@@ -1,7 +1,7 @@
 
 /* pngerror.c - stub functions for i/o and memory allocation
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngget.c b/pngget.c
index 248db8e..c27d9b9 100644
--- a/pngget.c
+++ b/pngget.c
@@ -1,7 +1,7 @@
 
 /* pngget.c - retrieval of values from info struct
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngmem.c b/pngmem.c
index 06eb090..cbaa27e 100644
--- a/pngmem.c
+++ b/pngmem.c
@@ -1,7 +1,7 @@
 
 /* pngmem.c - stub functions for memory allocation
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngnow.png b/pngnow.png
new file mode 100644
index 0000000..16280e7
--- /dev/null
+++ b/pngnow.png
Binary files differ
diff --git a/pngpread.c b/pngpread.c
index 9d2fe48..d703b81 100644
--- a/pngpread.c
+++ b/pngpread.c
@@ -1,7 +1,7 @@
 
 /* pngpread.c - read a png file in push mode
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngread.c b/pngread.c
index fc700f4..1517535 100644
--- a/pngread.c
+++ b/pngread.c
@@ -1,7 +1,7 @@
 
 /* pngread.c - read a PNG file
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
@@ -531,7 +531,7 @@
  * not called png_set_interlace_handling(), the display_row buffer will
  * be ignored, so pass NULL to it.
  *
- * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4.
+ * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4c.
  */
 
 void
@@ -580,7 +580,7 @@
  * only call this function once.  If you desire to have an image for
  * each pass of a interlaced image, use png_read_rows() instead.
  *
- * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4.
+ * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4c.
  */
 void
 png_read_image(png_structp png_ptr, png_bytepp image)
diff --git a/pngrio.c b/pngrio.c
index 8d4390c..4cc33a7 100644
--- a/pngrio.c
+++ b/pngrio.c
@@ -1,7 +1,7 @@
 
 /* pngrio.c - functions for data input
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngrtran.c b/pngrtran.c
index f8b8e80..9c2b0ed 100644
--- a/pngrtran.c
+++ b/pngrtran.c
@@ -1,7 +1,7 @@
 
 /* pngrtran.c - transforms the data in a row for PNG readers
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
@@ -1069,6 +1069,7 @@
    info_ptr->pixel_depth = (png_byte)(info_ptr->channels *
       info_ptr->bit_depth);
    info_ptr->rowbytes = ((info_ptr->width * info_ptr->pixel_depth + 7) >> 3);
+
 }
 
 /* Transform the row.  The order of transformations is significant,
diff --git a/pngrutil.c b/pngrutil.c
index c49ac61..bbf0838 100644
--- a/pngrutil.c
+++ b/pngrutil.c
@@ -1,7 +1,7 @@
 
 /* pngrutil.c - utilities to read a PNG file
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
@@ -945,7 +945,7 @@
       return;
    }
 
-   num = (int)length / 2;
+   num = (int)length / 2 ;
    png_ptr->hist = (png_uint_16p)png_malloc(png_ptr,
       (png_uint_32)(num * sizeof (png_uint_16)));
    png_ptr->flags |= PNG_FLAG_FREE_HIST;
@@ -1892,6 +1892,7 @@
             png_size_t pixel_bytes = (row_info->pixel_depth >> 3);
             png_bytep sp = row + (png_size_t)(row_info->width - 1) * pixel_bytes;
             png_bytep dp = row + (png_size_t)(final_width - 1) * pixel_bytes;
+
             int jstop = png_pass_inc[pass];
             png_uint_32 i;
 
@@ -1937,7 +1938,7 @@
       {
          png_uint_32 i;
          png_uint_32 istop = row_info->rowbytes;
-         png_uint_32 bpp = (row_info->pixel_depth + 7) / 8;
+         png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
          png_bytep rp = row + bpp;
          png_bytep lp = row;
 
@@ -1968,20 +1969,20 @@
          png_bytep rp = row;
          png_bytep pp = prev_row;
          png_bytep lp = row;
-         png_uint_32 bpp = (row_info->pixel_depth + 7) / 8;
+         png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
          png_uint_32 istop = row_info->rowbytes - bpp;
 
          for (i = 0; i < bpp; i++)
          {
             *rp = (png_byte)(((int)(*rp) +
-               ((int)(*pp++) / 2)) & 0xff);
+               ((int)(*pp++) / 2 )) & 0xff);
             rp++;
          }
 
          for (i = 0; i < istop; i++)
          {
             *rp = (png_byte)(((int)(*rp) +
-               (int)(*pp++ + *lp++) / 2) & 0xff);
+               (int)(*pp++ + *lp++) / 2 ) & 0xff);
             rp++;
          }
          break;
@@ -1993,7 +1994,7 @@
          png_bytep pp = prev_row;
          png_bytep lp = row;
          png_bytep cp = prev_row;
-         png_uint_32 bpp = (row_info->pixel_depth + 7) / 8;
+         png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
          png_uint_32 istop=row_info->rowbytes - bpp;
 
          for (i = 0; i < bpp; i++)
@@ -2267,6 +2268,16 @@
    }
 #endif
 
+#if defined(PNG_READ_USER_TRANSFORM_SUPPORTED)
+   if(png_ptr->transformations & PNG_USER_TRANSFORM)
+     {
+       int user_pixel_depth=png_ptr->user_transform_depth*
+         png_ptr->user_transform_channels;
+       if(user_pixel_depth > max_pixel_depth)
+         max_pixel_depth=user_pixel_depth;
+     }
+#endif
+
    /* align the width on the next larger 8 pixels.  Mainly used
       for interlacing */
    row_bytes = ((png_ptr->width + 7) & ~((png_uint_32)7));
diff --git a/pngset.c b/pngset.c
index e0f9e0a..bfec907 100644
--- a/pngset.c
+++ b/pngset.c
@@ -1,7 +1,7 @@
 
 /* pngset.c - storage of image information into info struct
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngtest.c b/pngtest.c
index afd5c37..51d289c 100644
--- a/pngtest.c
+++ b/pngtest.c
@@ -1,7 +1,7 @@
 
 /* pngtest.c - a simple test program to test libpng
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
@@ -35,8 +35,24 @@
 #define PNG_DEBUG 0
 #endif
 
+/* Turn on CPU timing
+#define PNGTEST_TIMING
+*/
+
+#ifdef PNGTEST_TIMING
+static float t_start, t_stop, t_decode, t_encode, t_misc;
+#include <time.h>
+#endif
+
 #include "png.h"
 
+#ifdef PNGTEST_TIMING
+static float t_start, t_stop, t_decode, t_encode, t_misc;
+#if !defined(PNG_READ_tIME_SUPPORTED) && !defined(PNG_WRITE_tIME_SUPPORTED)
+#include <time.h>
+#endif
+#endif
+
 #if defined(PNG_TIME_RFC1123_SUPPORTED)
 static int tIME_chunk_present=0;
 static char tIME_string[30] = "no tIME chunk present in file";
@@ -800,16 +816,36 @@
    }
    png_debug(0, "Writing row data\n");
 
+#if defined(PNG_READ_INTERLACING_SUPPORTED) || \
+  defined(PNG_WRITE_INTERLACING_SUPPORTED)
    num_pass = png_set_interlace_handling(read_ptr);
    png_set_interlace_handling(write_ptr);
+#else
+   num_pass=1;
+#endif
 
+#ifdef PNGTEST_TIMING
+   t_stop = (float)clock();
+   t_misc += (t_stop - t_start);
+   t_start = t_stop;
+#endif
    for (pass = 0; pass < num_pass; pass++)
    {
       png_debug1(0, "Writing row data for pass %d\n",pass);
       for (y = 0; y < height; y++)
       {
          png_read_rows(read_ptr, (png_bytepp)&row_buf, (png_bytepp)NULL, 1);
+#ifdef PNGTEST_TIMING
+         t_stop = (float)clock();
+         t_decode += (t_stop - t_start);
+         t_start = t_stop;
+#endif
          png_write_rows(write_ptr, (png_bytepp)&row_buf, 1);
+#ifdef PNGTEST_TIMING
+         t_stop = (float)clock();
+         t_encode += (t_stop - t_start);
+         t_start = t_stop;
+#endif
       }
    }
 
@@ -1040,7 +1076,9 @@
 #endif
       }
 #ifdef PNG_USER_MEM_SUPPORTED
-         fprintf(STDERR, "Maximum memory allocation: %d bytes\n",
+         fprintf(STDERR, " Current memory allocation: %d bytes\n",
+            current_allocation);
+         fprintf(STDERR, " Maximum memory allocation: %d bytes\n",
             maximum_allocation);
 #endif
    }
@@ -1103,11 +1141,27 @@
 #endif
        }
 #ifdef PNG_USER_MEM_SUPPORTED
-       fprintf(STDERR, "Maximum memory allocation: %d bytes\n",
+       fprintf(STDERR, " Current memory allocation: %d bytes\n",
+          current_allocation);
+       fprintf(STDERR, " Maximum memory allocation: %d bytes\n",
           maximum_allocation);
 #endif
    }
 
+#ifdef PNGTEST_TIMING
+   t_stop = (float)clock();
+   t_misc += (t_stop - t_start);
+   t_start = t_stop;
+   fprintf(STDERR," CPU time used = %.3f seconds",
+      (t_misc+t_decode+t_encode)/(float)CLOCKS_PER_SEC);
+   fprintf(STDERR," (decoding %.3f,\n",
+      t_decode/(float)CLOCKS_PER_SEC);
+   fprintf(STDERR,"        encoding %.3f ,",
+      t_encode/(float)CLOCKS_PER_SEC);
+   fprintf(STDERR," other %.3f seconds)\n\n",
+      t_misc/(float)CLOCKS_PER_SEC);
+#endif
+
    if (ierror == 0)
       fprintf(STDERR, "libpng passes test\n");
    else
@@ -1115,3 +1169,10 @@
    return (int)(ierror != 0);
 }
 
+/* Generate a compiler error if there is an old png.h in the search path. */
+void
+png_check_pngtest_version
+   (version_1_0_4c png_h_is_not_version_1_0_4c)
+{
+   if(png_h_is_not_version_1_0_4c == NULL) return;
+}
diff --git a/pngtrans.c b/pngtrans.c
index bf14018..57a1f94 100644
--- a/pngtrans.c
+++ b/pngtrans.c
@@ -1,7 +1,7 @@
 
 /* pngtrans.c - transforms the data in a row (used by both readers and writers)
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngvcrd.c b/pngvcrd.c
index 8f429d9..4ab0a91 100644
--- a/pngvcrd.c
+++ b/pngvcrd.c
@@ -1,13 +1,13 @@
-/* pngvcrd.c - assembler version of utilities to read a PNG file
+/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
  *
- * For Intel CPU and Microsoft Visual C++ compiler
+ * For Intel x86 CPU and Microsoft Visual C++ compiler
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, Intel Corporation
  * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
  *
- * Contributed by Nirav Chhatrapati, INTEL Corporation, 1998
+ * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
  * Interface to libpng contributed by Gilles Vollant, 1999
  *
  */
@@ -15,7 +15,7 @@
 #define PNG_INTERNAL
 #include "png.h"
 
-#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
+#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
 
 static int mmx_supported=2;
 
@@ -68,8 +68,8 @@
 
   }
 
-//mmx_supported_local=0; // test code for force don't support MMX
-    //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
+  //mmx_supported_local=0; // test code for force don't support MMX
+  //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
 
   return mmx_supported_local;
 }
@@ -85,858 +85,857 @@
    to any alpha or transparency value associated with the pixel.  If
    you want all pixels to be combined, pass 0xff (255) in mask.  */
 
-/* Use this routine for X86 platform - uses faster MMX routine if machine
-supports MMX */
+/* Use this routine for x86 platform - uses faster MMX routine if machine
+   supports MMX */
 
 void
-png_combine_row(png_structp png_ptr, png_bytep row,
-   int mask)
+png_combine_row(png_structp png_ptr, png_bytep row, int mask)
 {
-   //int mmx_supported=0; // another test code for remove MMX in this routine
+   int save_mmx_supported = mmx_supported;
    png_debug(1,"in png_combine_row_asm\n");
-   //if (mmx_supported==2)
-   //    mmx_supported=mmxsupport();
+
+   if ((png_ptr->transformations & PNG_INTERLACE) && png_ptr->pass != 6)
+       mmx_supported = 0;
+   else
+      if (mmx_supported == 2)
+          mmx_supported = mmxsupport();
 
    if (mask == 0xff)
    {
       png_memcpy(row, png_ptr->row_buf + 1,
-   (png_size_t)((png_ptr->width *
-   png_ptr->row_info.pixel_depth + 7) >> 3));
+       (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
    }
    else
    {
-    switch (png_ptr->row_info.pixel_depth)
+      switch (png_ptr->row_info.pixel_depth)
       {
-   case 1:
-   {
-      png_bytep sp;
-      png_bytep dp;
-      int s_inc, s_start, s_end;
-      int m;
-      int shift;
-      png_uint_32 i;
+         case 1:
+         {
+            png_bytep sp;
+            png_bytep dp;
+            int s_inc, s_start, s_end;
+            int m;
+            int shift;
+            png_uint_32 i;
 
-      sp = png_ptr->row_buf + 1;
-      dp = row;
-      m = 0x80;
+            sp = png_ptr->row_buf + 1;
+            dp = row;
+            m = 0x80;
 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
-      if (png_ptr->transformations & PNG_PACKSWAP)
-      {
-    s_start = 0;
-    s_end = 7;
-    s_inc = 1;
-      }
-      else
+            if (png_ptr->transformations & PNG_PACKSWAP)
+            {
+                s_start = 0;
+                s_end = 7;
+                s_inc = 1;
+            }
+            else
 #endif
-      {
-    s_start = 7;
-    s_end = 0;
-    s_inc = -1;
-      }
+            {
+                s_start = 7;
+                s_end = 0;
+                s_inc = -1;
+            }
 
-      shift = s_start;
+            shift = s_start;
 
-      for (i = 0; i < png_ptr->width; i++)
-      {
-         if (m & mask)
-         {
-      int value;
+            for (i = 0; i < png_ptr->width; i++)
+            {
+               if (m & mask)
+               {
+                  int value;
 
-      value = (*sp >> shift) & 0x1;
-      *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
-      *dp |= (png_byte)(value << shift);
+                  value = (*sp >> shift) & 0x1;
+                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
+                  *dp |= (png_byte)(value << shift);
+               }
+
+               if (shift == s_end)
+               {
+                  shift = s_start;
+                  sp++;
+                  dp++;
+               }
+               else
+                  shift += s_inc;
+
+               if (m == 1)
+                  m = 0x80;
+               else
+                  m >>= 1;
+            }
+            break;
          }
 
-         if (shift == s_end)
+         case 2:
          {
-      shift = s_start;
-      sp++;
-      dp++;
-         }
-         else
-      shift += s_inc;
+            png_bytep sp;
+            png_bytep dp;
+            int s_start, s_end, s_inc;
+            int m;
+            int shift;
+            png_uint_32 i;
+            int value;
 
-         if (m == 1)
-      m = 0x80;
-         else
-      m >>= 1;
-      }
-      break;
-   }
-   case 2:
-   {
-      png_bytep sp;
-      png_bytep dp;
-      int s_start, s_end, s_inc;
-      int m;
-      int shift;
-      png_uint_32 i;
-      int value;
-
-      sp = png_ptr->row_buf + 1;
-      dp = row;
-      m = 0x80;
+            sp = png_ptr->row_buf + 1;
+            dp = row;
+            m = 0x80;
 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
-      if (png_ptr->transformations & PNG_PACKSWAP)
-      {
-         s_start = 0;
-         s_end = 6;
-         s_inc = 2;
-      }
-      else
+            if (png_ptr->transformations & PNG_PACKSWAP)
+            {
+               s_start = 0;
+               s_end = 6;
+               s_inc = 2;
+            }
+            else
 #endif
-      {
-         s_start = 6;
-         s_end = 0;
-         s_inc = -2;
-      }
+            {
+               s_start = 6;
+               s_end = 0;
+               s_inc = -2;
+            }
 
-      shift = s_start;
+            shift = s_start;
 
-      for (i = 0; i < png_ptr->width; i++)
-      {
-         if (m & mask)
-         {
-      value = (*sp >> shift) & 0x3;
-      *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
-      *dp |= (png_byte)(value << shift);
+            for (i = 0; i < png_ptr->width; i++)
+            {
+               if (m & mask)
+               {
+                  value = (*sp >> shift) & 0x3;
+                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
+                  *dp |= (png_byte)(value << shift);
+               }
+
+               if (shift == s_end)
+               {
+                  shift = s_start;
+                  sp++;
+                  dp++;
+               }
+               else
+                  shift += s_inc;
+               if (m == 1)
+                  m = 0x80;
+               else
+                  m >>= 1;
+            }
+            break;
          }
-
-         if (shift == s_end)
+         case 4:
          {
-      shift = s_start;
-      sp++;
-      dp++;
-         }
-         else
-      shift += s_inc;
-         if (m == 1)
-      m = 0x80;
-         else
-      m >>= 1;
-      }
-      break;
-   }
-   case 4:
-   {
-      png_bytep sp;
-      png_bytep dp;
-      int s_start, s_end, s_inc;
-      int m;
-      int shift;
-      png_uint_32 i;
-      int value;
+            png_bytep sp;
+            png_bytep dp;
+            int s_start, s_end, s_inc;
+            int m;
+            int shift;
+            png_uint_32 i;
+            int value;
 
-      sp = png_ptr->row_buf + 1;
-      dp = row;
-      m = 0x80;
+            sp = png_ptr->row_buf + 1;
+            dp = row;
+            m = 0x80;
 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
-      if (png_ptr->transformations & PNG_PACKSWAP)
-      {
-         s_start = 0;
-         s_end = 4;
-         s_inc = 4;
-      }
-      else
+            if (png_ptr->transformations & PNG_PACKSWAP)
+            {
+               s_start = 0;
+               s_end = 4;
+               s_inc = 4;
+            }
+            else
 #endif
-      {
-         s_start = 4;
-         s_end = 0;
-         s_inc = -4;
-      }
-      shift = s_start;
+            {
+               s_start = 4;
+               s_end = 0;
+               s_inc = -4;
+            }
+            shift = s_start;
 
-      for (i = 0; i < png_ptr->width; i++)
-      {
-         if (m & mask)
-         {
-      value = (*sp >> shift) & 0xf;
-      *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
-      *dp |= (png_byte)(value << shift);
+            for (i = 0; i < png_ptr->width; i++)
+            {
+               if (m & mask)
+               {
+                  value = (*sp >> shift) & 0xf;
+                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
+                  *dp |= (png_byte)(value << shift);
+               }
+
+               if (shift == s_end)
+               {
+                  shift = s_start;
+                  sp++;
+                  dp++;
+               }
+               else
+                  shift += s_inc;
+               if (m == 1)
+                  m = 0x80;
+               else
+                  m >>= 1;
+            }
+            break;
          }
-
-         if (shift == s_end)
+         case 8:
          {
-      shift = s_start;
-      sp++;
-      dp++;
-         }
-         else
-      shift += s_inc;
-         if (m == 1)
-      m = 0x80;
-         else
-      m >>= 1;
-      }
-      break;
-   }
-     case 8:
-        {
-        png_bytep srcptr;
-        png_bytep dstptr;
-        png_uint_32 len;
-        int m;
-        int diff, unmask;
+            png_bytep srcptr;
+            png_bytep dstptr;
+            png_uint_32 len;
+            int m;
+            int diff, unmask;
 
-        __int64 mask0=0x0102040810204080;
+            __int64 mask0=0x0102040810204080;
 
-        if (mmx_supported)
-        {
-        srcptr = png_ptr->row_buf + 1;
-        dstptr = row;
-        m = 0x80;
-        unmask = ~mask;
-        len  = png_ptr->width &~7;  //reduce to multiple of 8
-        diff = png_ptr->width & 7;  //amount lost
-        _asm {
-          movd    mm7, unmask        //load bit pattern
-          psubb   mm6,mm6            //zero mm6
-          punpcklbw mm7,mm7
-          punpcklwd mm7,mm7
-          punpckldq mm7,mm7          //fill register with 8 masks
+            if (mmx_supported)
+            {
+               srcptr = png_ptr->row_buf + 1;
+               dstptr = row;
+               m = 0x80;
+               unmask = ~mask;
+               len  = png_ptr->width &~7;  //reduce to multiple of 8
+               diff = png_ptr->width & 7;  //amount lost
 
-          movq    mm0,mask0
+               _asm
+               {
+                  movd       mm7, unmask   //load bit pattern
+                  psubb      mm6,mm6       //zero mm6
+                  punpcklbw  mm7,mm7
+                  punpcklwd  mm7,mm7
+                  punpckldq  mm7,mm7       //fill register with 8 masks
 
-          pand    mm0,mm7            //nonzero if keep byte
-          pcmpeqb mm0,mm6            //zeros->1s, v versa
+                  movq       mm0,mask0
 
-          mov             ecx,len    //load length of line
-          mov             esi,srcptr //load source
-          mov             ebx,dstptr //load dest
-               cmp   ecx,0 //lcr
-               je    mainloop8end
+                  pand       mm0,mm7       //nonzero if keep byte
+                  pcmpeqb    mm0,mm6       //zeros->1s, v versa
+
+                  mov        ecx,len       //load length of line
+                  mov        esi,srcptr    //load source
+                  mov        ebx,dstptr    //load dest
+                  cmp        ecx,0         //lcr
+                  je         mainloop8end
 
 mainloop8:
-          movq    mm4,[esi]
-          pand    mm4,mm0
-          movq    mm6,mm0
-          pandn   mm6,[ebx]
-          por             mm4,mm6
-          movq    [ebx],mm4
+                  movq       mm4,[esi]
+                  pand       mm4,mm0
+                  movq       mm6,mm0
+                  pandn      mm6,[ebx]
+                  por        mm4,mm6
+                  movq       [ebx],mm4
 
-          add             esi,8       //inc by 8 bytes processed
-          add             ebx,8
-          sub             ecx,8       //dec by 8 pixels processed
+                  add        esi,8         //inc by 8 bytes processed
+                  add        ebx,8
+                  sub        ecx,8         //dec by 8 pixels processed
 
-          ja              mainloop8
+                  ja         mainloop8
 mainloop8end:
 
-          mov             ecx,diff
-          cmp             ecx,0
-          jz              end8
+                  mov        ecx,diff
+                  cmp        ecx,0
+                  jz         end8
 
-          mov             edx,mask
-          sal             edx,24      //make low byte the high byte
+                  mov        edx,mask
+                  sal        edx,24        //make low byte the high byte
 
 secondloop8:
-          sal             edx,1       //move high bit to CF
-          jnc             skip8       //if CF = 0
-          mov             al,[esi]
-          mov             [ebx],al
+                  sal        edx,1         //move high bit to CF
+                  jnc        skip8         //if CF = 0
+                  mov        al,[esi]
+                  mov        [ebx],al
 skip8:
-          inc             esi
-          inc             ebx
+                  inc        esi
+                  inc        ebx
 
-          dec             ecx
-          jnz secondloop8
+                  dec        ecx
+                  jnz        secondloop8
 end8:
-          emms
-          }
-        }
-        else /* mmx _not supported - Use modified C routine*/
-        {
-          register unsigned int incr1, initial_val, final_val;
-          png_size_t pixel_bytes;
-          png_uint_32 i;
-          //if ((mask != 0x0f) && (mask != 0x33))
-          register int disp = png_pass_inc[png_ptr->pass];
-          int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
-          pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-          srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-             pixel_bytes;
-          dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-          initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-          final_val = png_ptr->width*pixel_bytes;
-          incr1 = (disp)*pixel_bytes;
-          for (i = initial_val; i < final_val; i += incr1)
-          {
-          png_memcpy(dstptr, srcptr, pixel_bytes);
-          srcptr += incr1;
-          dstptr += incr1;
-          }
-        } /* end of else */
+                  emms
+               }
+            }
+            else /* mmx not supported - use modified C routine */
+            {
+               register unsigned int incr1, initial_val, final_val;
+               png_size_t pixel_bytes;
+               png_uint_32 i;
+               //if ((mask != 0x0f) && (mask != 0x33))
+               register int disp = png_pass_inc[png_ptr->pass];
+               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+                  pixel_bytes;
+               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+               final_val = png_ptr->width*pixel_bytes;
+               incr1 = (disp)*pixel_bytes;
+               for (i = initial_val; i < final_val; i += incr1)
+               {
+                  png_memcpy(dstptr, srcptr, pixel_bytes);
+                  srcptr += incr1;
+                  dstptr += incr1;
+               }
+            } /* end of else */
 
-        break;
-        }       //end 8bpp
+            break;
+         }       //end 8bpp
 
-        case 16:
-        {
-        png_bytep srcptr;
-          png_bytep dstptr;
-          png_uint_32 len;
-        int unmask, diff;
+         case 16:
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+            png_uint_32 len;
+            int unmask, diff;
+            __int64 mask1=0x0101020204040808,
+                    mask0=0x1010202040408080;
 
-        __int64 mask1=0x0101020204040808,
-            mask0=0x1010202040408080;
+            if (mmx_supported)
+            {
+               srcptr = png_ptr->row_buf + 1;
+               dstptr = row;
 
-        if (mmx_supported)
-        {
-          srcptr = png_ptr->row_buf + 1;
-          dstptr = row;
+               unmask = ~mask;
+               len     = (png_ptr->width)&~7;
+               diff = (png_ptr->width)&7;
+               _asm
+               {
+                  movd       mm7, unmask       //load bit pattern
+                  psubb      mm6,mm6           //zero mm6
+                  punpcklbw  mm7,mm7
+                  punpcklwd  mm7,mm7
+                  punpckldq  mm7,mm7           //fill register with 8 masks
 
-        unmask = ~mask;
-        len     = (png_ptr->width)&~7;
-        diff = (png_ptr->width)&7;
-        _asm {
-          movd    mm7, unmask       //load bit pattern
-          psubb   mm6,mm6           //zero mm6
-          punpcklbw mm7,mm7
-          punpcklwd mm7,mm7
-          punpckldq mm7,mm7         //fill register with 8 masks
+                  movq       mm0,mask0
+                  movq       mm1,mask1
 
-          movq    mm0,mask0
-          movq    mm1,mask1
+                  pand       mm0,mm7
+                  pand       mm1,mm7
 
-          pand    mm0,mm7
-          pand    mm1,mm7
+                  pcmpeqb    mm0,mm6
+                  pcmpeqb    mm1,mm6
 
-          pcmpeqb mm0,mm6
-          pcmpeqb mm1,mm6
-
-          mov             ecx,len    //load length of line
-          mov             esi,srcptr //load source
-          mov             ebx,dstptr //load dest
-               cmp   ecx,0 //lcr
-               jz    mainloop16end
+                  mov        ecx,len           //load length of line
+                  mov        esi,srcptr        //load source
+                  mov        ebx,dstptr        //load dest
+                  cmp        ecx,0             //lcr
+                  jz         mainloop16end
 
 mainloop16:
-          movq    mm4,[esi]
-          pand    mm4,mm0
-          movq    mm6,mm0
-          movq    mm7,[ebx]
-          pandn   mm6,mm7
-          por             mm4,mm6
-          movq    [ebx],mm4
+                  movq       mm4,[esi]
+                  pand       mm4,mm0
+                  movq       mm6,mm0
+                  movq       mm7,[ebx]
+                  pandn      mm6,mm7
+                  por        mm4,mm6
+                  movq       [ebx],mm4
 
-          movq    mm5,[esi+8]
-          pand    mm5,mm1
-          movq    mm7,mm1
-          movq    mm6,[ebx+8]
-          pandn   mm7,mm6
-          por             mm5,mm7
-          movq    [ebx+8],mm5
+                  movq       mm5,[esi+8]
+                  pand       mm5,mm1
+                  movq       mm7,mm1
+                  movq       mm6,[ebx+8]
+                  pandn      mm7,mm6
+                  por        mm5,mm7
+                  movq       [ebx+8],mm5
 
-          add             esi,16      //inc by 16 bytes processed
-          add             ebx,16
-          sub             ecx,8       //dec by 8 pixels processed
+                  add        esi,16            //inc by 16 bytes processed
+                  add        ebx,16
+                  sub        ecx,8             //dec by 8 pixels processed
 
-          ja              mainloop16
+                  ja         mainloop16
+
 mainloop16end:
+                  mov        ecx,diff
+                  cmp        ecx,0
+                  jz         end16
 
-          mov             ecx,diff
-          cmp             ecx,0
-          jz              end16
-
-          mov             edx,mask
-          sal             edx,24      //make low byte the high byte
-
+                  mov        edx,mask
+                  sal        edx,24            //make low byte the high byte
 secondloop16:
-          sal             edx,1       //move high bit to CF
-          jnc             skip16      //if CF = 0
-          mov             ax,[esi]
-          mov             [ebx],ax
+                  sal        edx,1             //move high bit to CF
+                  jnc        skip16            //if CF = 0
+                  mov        ax,[esi]
+                  mov        [ebx],ax
 skip16:
-          add             esi,2
-          add             ebx,2
+                  add        esi,2
+                  add        ebx,2
 
-          dec             ecx
-          jnz secondloop16
-
+                  dec        ecx
+                  jnz        secondloop16
 end16:
-          emms
-          }
-        }
-        else /* mmx _not supported - Use modified C routine */
-        {
-          register unsigned int incr1, initial_val, final_val;
-          png_size_t pixel_bytes;
-          png_uint_32 i;
-          register int disp = png_pass_inc[png_ptr->pass];
-          int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
-          pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-          srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-             pixel_bytes;
-          dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-          initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-          final_val = png_ptr->width*pixel_bytes;
-          incr1 = (disp)*pixel_bytes;
-          for (i = initial_val; i < final_val; i += incr1)
-          {
-          png_memcpy(dstptr, srcptr, pixel_bytes);
-          srcptr += incr1;
-          dstptr += incr1;
-          }
-        } /* end of else */
+                  emms
+               }
+            }
+            else /* mmx not supported - use modified C routine */
+            {
+               register unsigned int incr1, initial_val, final_val;
+               png_size_t pixel_bytes;
+               png_uint_32 i;
+               register int disp = png_pass_inc[png_ptr->pass];
+               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
 
-        break;
-        }
-      case 24:
-        {
-        png_bytep srcptr;
-          png_bytep dstptr;
-          png_uint_32 len;
-        int unmask, diff;
+               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+                  pixel_bytes;
+               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+               final_val = png_ptr->width*pixel_bytes;
+               incr1 = (disp)*pixel_bytes;
+               for (i = initial_val; i < final_val; i += incr1)
+               {
+                  png_memcpy(dstptr, srcptr, pixel_bytes);
+                  srcptr += incr1;
+                  dstptr += incr1;
+               }
+            } /* end of else */
 
-        __int64 mask2=0x0101010202020404, //24bpp
-            mask1=0x0408080810101020,
-            mask0=0x2020404040808080;
+            break;
+         }
 
-          srcptr = png_ptr->row_buf + 1;
-          dstptr = row;
+         case 24:
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+            png_uint_32 len;
+            int unmask, diff;
 
-        unmask = ~mask;
-        len     = (png_ptr->width)&~7;
-        diff = (png_ptr->width)&7;
+            __int64 mask2=0x0101010202020404,  //24bpp
+                    mask1=0x0408080810101020,
+                    mask0=0x2020404040808080;
 
-        if (mmx_supported)
-        {
-        _asm {
-          movd    mm7, unmask         //load bit pattern
-          psubb   mm6,mm6             //zero mm6
-          punpcklbw mm7,mm7
-          punpcklwd mm7,mm7
-          punpckldq mm7,mm7           //fill register with 8 masks
+            srcptr = png_ptr->row_buf + 1;
+            dstptr = row;
 
-          movq    mm0,mask0
-          movq    mm1,mask1
-          movq    mm2,mask2
+            unmask = ~mask;
+            len     = (png_ptr->width)&~7;
+            diff = (png_ptr->width)&7;
 
+            if (mmx_supported)
+            {
+               _asm
+               {
+                  movd       mm7, unmask       //load bit pattern
+                  psubb      mm6,mm6           //zero mm6
+                  punpcklbw  mm7,mm7
+                  punpcklwd  mm7,mm7
+                  punpckldq  mm7,mm7           //fill register with 8 masks
 
-          pand    mm0,mm7
-          pand    mm1,mm7
-          pand    mm2,mm7
+                  movq       mm0,mask0
+                  movq       mm1,mask1
+                  movq       mm2,mask2
 
-          pcmpeqb mm0,mm6
-          pcmpeqb mm1,mm6
-          pcmpeqb mm2,mm6
+                  pand       mm0,mm7
+                  pand       mm1,mm7
+                  pand       mm2,mm7
 
-          mov             ecx,len     //load length of line
-          mov             esi,srcptr  //load source
-          mov             ebx,dstptr  //load dest
-               cmp   ecx,0
-               jz    mainloop24end
+                  pcmpeqb    mm0,mm6
+                  pcmpeqb    mm1,mm6
+                  pcmpeqb    mm2,mm6
+
+                  mov        ecx,len           //load length of line
+                  mov        esi,srcptr        //load source
+                  mov        ebx,dstptr        //load dest
+                  cmp        ecx,0
+                  jz         mainloop24end
 
 mainloop24:
-          movq    mm4,[esi]
-          pand    mm4,mm0
-          movq    mm6,mm0
-          movq    mm7,[ebx]
-          pandn   mm6,mm7
-          por             mm4,mm6
-          movq    [ebx],mm4
+                  movq       mm4,[esi]
+                  pand       mm4,mm0
+                  movq       mm6,mm0
+                  movq       mm7,[ebx]
+                  pandn      mm6,mm7
+                  por        mm4,mm6
+                  movq       [ebx],mm4
 
 
-          movq    mm5,[esi+8]
-          pand    mm5,mm1
-          movq    mm7,mm1
-          movq    mm6,[ebx+8]
-          pandn   mm7,mm6
-          por             mm5,mm7
-          movq    [ebx+8],mm5
+                  movq       mm5,[esi+8]
+                  pand       mm5,mm1
+                  movq       mm7,mm1
+                  movq       mm6,[ebx+8]
+                  pandn      mm7,mm6
+                  por        mm5,mm7
+                  movq       [ebx+8],mm5
 
-          movq    mm6,[esi+16]
-          pand    mm6,mm2
-          movq    mm4,mm2
-          movq    mm7,[ebx+16]
-          pandn   mm4,mm7
-          por             mm6,mm4
-          movq    [ebx+16],mm6
+                  movq       mm6,[esi+16]
+                  pand       mm6,mm2
+                  movq       mm4,mm2
+                  movq       mm7,[ebx+16]
+                  pandn      mm4,mm7
+                  por        mm6,mm4
+                  movq       [ebx+16],mm6
 
-          add             esi,24      //inc by 24 bytes processed
-          add             ebx,24
-          sub             ecx,8       //dec by 8 pixels processed
+                  add        esi,24            //inc by 24 bytes processed
+                  add        ebx,24
+                  sub        ecx,8             //dec by 8 pixels processed
 
-          ja              mainloop24
+                  ja         mainloop24
+
 mainloop24end:
+                  mov        ecx,diff
+                  cmp        ecx,0
+                  jz         end24
 
-          mov             ecx,diff
-          cmp             ecx,0
-          jz              end24
-
-          mov             edx,mask
-          sal             edx,24      //make low byte the high byte
-
+                  mov        edx,mask
+                  sal        edx,24            //make low byte the high byte
 secondloop24:
-          sal             edx,1       //move high bit to CF
-          jnc             skip24      //if CF = 0
-          mov             ax,[esi]
-          mov             [ebx],ax
-          xor             eax,eax
-          mov             al,[esi+2]
-          mov             [ebx+2],al
+                  sal        edx,1             //move high bit to CF
+                  jnc        skip24            //if CF = 0
+                  mov        ax,[esi]
+                  mov        [ebx],ax
+                  xor        eax,eax
+                  mov        al,[esi+2]
+                  mov        [ebx+2],al
 skip24:
-          add             esi,3
-          add             ebx,3
+                  add        esi,3
+                  add        ebx,3
 
-          dec             ecx
-          jnz secondloop24
+                  dec        ecx
+                  jnz        secondloop24
 
 end24:
-          emms
+                  emms
+               }
+            }
+            else /* mmx not supported - use modified C routine */
+            {
+               register unsigned int incr1, initial_val, final_val;
+               png_size_t pixel_bytes;
+               png_uint_32 i;
+               register int disp = png_pass_inc[png_ptr->pass];
+               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+                  pixel_bytes;
+               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+               final_val = png_ptr->width*pixel_bytes;
+               incr1 = (disp)*pixel_bytes;
+               for (i = initial_val; i < final_val; i += incr1)
+               {
+                  png_memcpy(dstptr, srcptr, pixel_bytes);
+                  srcptr += incr1;
+                  dstptr += incr1;
+               }
+            } /* end of else */
 
-          }
-        }
-        else /* mmx _not supported - Use modified C routine */
-        {
-          register unsigned int incr1, initial_val, final_val;
-          png_size_t pixel_bytes;
-          png_uint_32 i;
-          register int disp = png_pass_inc[png_ptr->pass];
-          int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
-          pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-          srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]
-             *pixel_bytes;
-          dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-          initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-          final_val = png_ptr->width*pixel_bytes;
-          incr1 = (disp)*pixel_bytes;
-          for (i = initial_val; i < final_val; i += incr1)
-          {
-          png_memcpy(dstptr, srcptr, pixel_bytes);
-          srcptr += incr1;
-          dstptr += incr1;
-          }
-        } /* end of else */
+            break;
+         }       //end 24bpp
 
-        break;
-      }       //end 24bpp
-    case 32:
-      {
-      png_bytep srcptr;
-      png_bytep dstptr;
-      png_uint_32 len;
-      int unmask, diff;
+         case 32:
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+            png_uint_32 len;
+            int unmask, diff;
 
+            __int64 mask3=0x0101010102020202,  //32bpp
+                    mask2=0x0404040408080808,
+                    mask1=0x1010101020202020,
+                    mask0=0x4040404080808080;
 
+            srcptr = png_ptr->row_buf + 1;
+            dstptr = row;
 
-      __int64 mask3=0x0101010102020202,       //32bpp
-          mask2=0x0404040408080808,
-          mask1=0x1010101020202020,
-          mask0=0x4040404080808080;
+            unmask = ~mask;
+            len     = (png_ptr->width)&~7;
+            diff = (png_ptr->width)&7;
 
-      srcptr = png_ptr->row_buf + 1;
-      dstptr = row;
+            if (mmx_supported)
+            {
+               _asm
+               {
+                  movd       mm7, unmask       //load bit pattern
+                  psubb      mm6,mm6           //zero mm6
+                  punpcklbw  mm7,mm7
+                  punpcklwd  mm7,mm7
+                  punpckldq  mm7,mm7           //fill register with 8 masks
 
-      unmask = ~mask;
-      len     = (png_ptr->width)&~7;
-      diff = (png_ptr->width)&7;
+                  movq       mm0,mask0
+                  movq       mm1,mask1
+                  movq       mm2,mask2
+                  movq       mm3,mask3
 
-      if (mmx_supported)
-      {
-      _asm {
-        movd    mm7, unmask           //load bit pattern
-        psubb   mm6,mm6               //zero mm6
-        punpcklbw mm7,mm7
-        punpcklwd mm7,mm7
-        punpckldq mm7,mm7             //fill register with 8 masks
+                  pand       mm0,mm7
+                  pand       mm1,mm7
+                  pand       mm2,mm7
+                  pand       mm3,mm7
 
-        movq    mm0,mask0
-        movq    mm1,mask1
-        movq    mm2,mask2
-        movq    mm3,mask3
+                  pcmpeqb    mm0,mm6
+                  pcmpeqb    mm1,mm6
+                  pcmpeqb    mm2,mm6
+                  pcmpeqb    mm3,mm6
 
+                  mov        ecx,len           //load length of line
+                  mov        esi,srcptr        //load source
+                  mov        ebx,dstptr        //load dest
 
-        pand    mm0,mm7
-        pand    mm1,mm7
-        pand    mm2,mm7
-        pand    mm3,mm7
-
-        pcmpeqb mm0,mm6
-        pcmpeqb mm1,mm6
-        pcmpeqb mm2,mm6
-        pcmpeqb mm3,mm6
-
-        mov             ecx,len       //load length of line
-        mov             esi,srcptr    //load source
-        mov             ebx,dstptr    //load dest
-
-            cmp   ecx,0 //lcr
-            jz    mainloop32end
+                  cmp        ecx,0             //lcr
+                  jz         mainloop32end
 
 mainloop32:
-        movq    mm4,[esi]
-        pand    mm4,mm0
-        movq    mm6,mm0
-        movq    mm7,[ebx]
-        pandn   mm6,mm7
-        por             mm4,mm6
-        movq    [ebx],mm4
+                  movq       mm4,[esi]
+                  pand       mm4,mm0
+                  movq       mm6,mm0
+                  movq       mm7,[ebx]
+                  pandn      mm6,mm7
+                  por        mm4,mm6
+                  movq       [ebx],mm4
 
+                  movq       mm5,[esi+8]
+                  pand       mm5,mm1
+                  movq       mm7,mm1
+                  movq       mm6,[ebx+8]
+                  pandn      mm7,mm6
+                  por        mm5,mm7
+                  movq       [ebx+8],mm5
 
-        movq    mm5,[esi+8]
-        pand    mm5,mm1
-        movq    mm7,mm1
-        movq    mm6,[ebx+8]
-        pandn   mm7,mm6
-        por             mm5,mm7
-        movq    [ebx+8],mm5
+                  movq       mm6,[esi+16]
+                  pand       mm6,mm2
+                  movq       mm4,mm2
+                  movq       mm7,[ebx+16]
+                  pandn      mm4,mm7
+                  por        mm6,mm4
+                  movq       [ebx+16],mm6
 
-        movq    mm6,[esi+16]
-        pand    mm6,mm2
-        movq    mm4,mm2
-        movq    mm7,[ebx+16]
-        pandn   mm4,mm7
-        por             mm6,mm4
-        movq    [ebx+16],mm6
+                  movq       mm7,[esi+24]
+                  pand       mm7,mm3
+                  movq       mm5,mm3
+                  movq       mm4,[ebx+24]
+                  pandn      mm5,mm4
+                  por        mm7,mm5
+                  movq       [ebx+24],mm7
 
-        movq    mm7,[esi+24]
-        pand    mm7,mm3
-        movq    mm5,mm3
-        movq    mm4,[ebx+24]
-        pandn   mm5,mm4
-        por             mm7,mm5
-        movq    [ebx+24],mm7
+                  add        esi,32            //inc by 32 bytes processed
+                  add        ebx,32
+                  sub        ecx,8             //dec by 8 pixels processed
 
+                  ja         mainloop32
 
-        add             esi,32        //inc by 32 bytes processed
-        add             ebx,32
-        sub             ecx,8         //dec by 8 pixels processed
-
-        ja              mainloop32
 mainloop32end:
+                  mov        ecx,diff
+                  cmp        ecx,0
+                  jz         end32
 
-        mov             ecx,diff
-        cmp             ecx,0
-        jz              end32
-
-        mov             edx,mask
-        sal             edx,24        //make low byte the high byte
-
+                  mov        edx,mask
+                  sal        edx,24            //make low byte the high byte
 secondloop32:
-        sal             edx,1         //move high bit to CF
-        jnc             skip32        //if CF = 0
-        mov             eax,[esi]
-        mov             [ebx],eax
+                  sal        edx,1             //move high bit to CF
+                  jnc        skip32            //if CF = 0
+                  mov        eax,[esi]
+                  mov        [ebx],eax
 skip32:
-        add             esi,4
-        add             ebx,4
+                  add        esi,4
+                  add        ebx,4
 
-        dec             ecx
-        jnz secondloop32
+                  dec        ecx
+                  jnz        secondloop32
 
 end32:
-        emms
+                  emms
+               }
+            }
+            else /* mmx not supported - use modified C routine */
+            {
+               register unsigned int incr1, initial_val, final_val;
+               png_size_t pixel_bytes;
+               png_uint_32 i;
+               register int disp = png_pass_inc[png_ptr->pass];
+               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+                  pixel_bytes;
+               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+               final_val = png_ptr->width*pixel_bytes;
+               incr1 = (disp)*pixel_bytes;
+               for (i = initial_val; i < final_val; i += incr1)
+               {
+                  png_memcpy(dstptr, srcptr, pixel_bytes);
+                  srcptr += incr1;
+                  dstptr += incr1;
+               }
+            } /* end of else */
 
-         }
-      }
-      else /* mmx _not supported - Use modified C routine */
-        {
-          register unsigned int incr1, initial_val, final_val;
-          png_size_t pixel_bytes;
-          png_uint_32 i;
-          register int disp = png_pass_inc[png_ptr->pass];
-          int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
-          pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-          srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-             pixel_bytes;
-          dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-          initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-          final_val = png_ptr->width*pixel_bytes;
-          incr1 = (disp)*pixel_bytes;
-          for (i = initial_val; i < final_val; i += incr1)
-          {
-          png_memcpy(dstptr, srcptr, pixel_bytes);
-          srcptr += incr1;
-          dstptr += incr1;
-          }
-        } /* end of else */
+            break;
+         }       //end 32bpp
 
-      break;
-      }       //end 32bpp
+         case 48:
+         {
+            png_bytep srcptr;
+            png_bytep dstptr;
+            png_uint_32 len;
+            int unmask, diff;
 
+            __int64 mask5=0x0101010101010202,
+                    mask4=0x0202020204040404,
+                    mask3=0x0404080808080808,
+                    mask2=0x1010101010102020,
+                    mask1=0x2020202040404040,
+                    mask0=0x4040808080808080;
 
-      case 48:
-      {
-      png_bytep srcptr;
-      png_bytep dstptr;
-      png_uint_32 len;
-      int unmask, diff;
+            if (mmx_supported)
+            {
+               srcptr = png_ptr->row_buf + 1;
+               dstptr = row;
 
-      __int64 mask5=0x0101010101010202,
-          mask4=0x0202020204040404,
-          mask3=0x0404080808080808,
-          mask2=0x1010101010102020,
-          mask1=0x2020202040404040,
-          mask0=0x4040808080808080;
+               unmask = ~mask;
+               len     = (png_ptr->width)&~7;
+               diff = (png_ptr->width)&7;
+               _asm
+               {
+                  movd       mm7, unmask       //load bit pattern
+                  psubb      mm6,mm6           //zero mm6
+                  punpcklbw  mm7,mm7
+                  punpcklwd  mm7,mm7
+                  punpckldq  mm7,mm7           //fill register with 8 masks
 
-      if (mmx_supported)
-      {
+                  movq       mm0,mask0
+                  movq       mm1,mask1
+                  movq       mm2,mask2
+                  movq       mm3,mask3
+                  movq       mm4,mask4
+                  movq       mm5,mask5
 
-      srcptr = png_ptr->row_buf + 1;
-      dstptr = row;
+                  pand       mm0,mm7
+                  pand       mm1,mm7
+                  pand       mm2,mm7
+                  pand       mm3,mm7
+                  pand       mm4,mm7
+                  pand       mm5,mm7
 
-      unmask = ~mask;
-      len     = (png_ptr->width)&~7;
-      diff = (png_ptr->width)&7;
-      _asm {
-        movd    mm7, unmask   //load bit pattern
-        psubb   mm6,mm6       //zero mm6
-        punpcklbw mm7,mm7
-        punpcklwd mm7,mm7
-        punpckldq mm7,mm7     //fill register with 8 masks
+                  pcmpeqb    mm0,mm6
+                  pcmpeqb    mm1,mm6
+                  pcmpeqb    mm2,mm6
+                  pcmpeqb    mm3,mm6
+                  pcmpeqb    mm4,mm6
+                  pcmpeqb    mm5,mm6
 
-        movq    mm0,mask0
-        movq    mm1,mask1
-        movq    mm2,mask2
-        movq    mm3,mask3
-        movq    mm4,mask4
-        movq    mm5,mask5
+                  mov        ecx,len           //load length of line
+                  mov        esi,srcptr        //load source
+                  mov        ebx,dstptr        //load dest
 
-        pand    mm0,mm7
-        pand    mm1,mm7
-        pand    mm2,mm7
-        pand    mm3,mm7
-        pand    mm4,mm7
-        pand    mm5,mm7
-
-        pcmpeqb mm0,mm6
-        pcmpeqb mm1,mm6
-        pcmpeqb mm2,mm6
-        pcmpeqb mm3,mm6
-        pcmpeqb mm4,mm6
-        pcmpeqb mm5,mm6
-
-        mov             ecx,len       //load length of line
-        mov             esi,srcptr    //load source
-        mov             ebx,dstptr    //load dest
-
-            cmp   ecx,0
-            jz    mainloop48end
+                  cmp        ecx,0
+                  jz         mainloop48end
 
 mainloop48:
-        movq    mm7,[esi]
-        pand    mm7,mm0
-        movq    mm6,mm0
-        pandn   mm6,[ebx]
-        por             mm7,mm6
-        movq    [ebx],mm7
+                  movq       mm7,[esi]
+                  pand       mm7,mm0
+                  movq       mm6,mm0
+                  pandn      mm6,[ebx]
+                  por        mm7,mm6
+                  movq       [ebx],mm7
 
+                  movq       mm6,[esi+8]
+                  pand       mm6,mm1
+                  movq       mm7,mm1
+                  pandn      mm7,[ebx+8]
+                  por        mm6,mm7
+                  movq       [ebx+8],mm6
 
-        movq    mm6,[esi+8]
-        pand    mm6,mm1
-        movq    mm7,mm1
-        pandn   mm7,[ebx+8]
-        por             mm6,mm7
-        movq    [ebx+8],mm6
+                  movq       mm6,[esi+16]
+                  pand       mm6,mm2
+                  movq       mm7,mm2
+                  pandn      mm7,[ebx+16]
+                  por        mm6,mm7
+                  movq       [ebx+16],mm6
 
-        movq    mm6,[esi+16]
-        pand    mm6,mm2
-        movq    mm7,mm2
-        pandn   mm7,[ebx+16]
-        por             mm6,mm7
-        movq    [ebx+16],mm6
+                  movq       mm7,[esi+24]
+                  pand       mm7,mm3
+                  movq       mm6,mm3
+                  pandn      mm6,[ebx+24]
+                  por        mm7,mm6
+                  movq       [ebx+24],mm7
 
-        movq    mm7,[esi+24]
-        pand    mm7,mm3
-        movq    mm6,mm3
-        pandn   mm6,[ebx+24]
-        por             mm7,mm6
-        movq    [ebx+24],mm7
+                  movq       mm6,[esi+32]
+                  pand       mm6,mm4
+                  movq       mm7,mm4
+                  pandn      mm7,[ebx+32]
+                  por        mm6,mm7
+                  movq       [ebx+32],mm6
 
-        movq    mm6,[esi+32]
-        pand    mm6,mm4
-        movq    mm7,mm4
-        pandn   mm7,[ebx+32]
-        por             mm6,mm7
-        movq    [ebx+32],mm6
+                  movq       mm7,[esi+40]
+                  pand       mm7,mm5
+                  movq       mm6,mm5
+                  pandn      mm6,[ebx+40]
+                  por        mm7,mm6
+                  movq       [ebx+40],mm7
 
-        movq    mm7,[esi+40]
-        pand    mm7,mm5
-        movq    mm6,mm5
-        pandn   mm6,[ebx+40]
-        por             mm7,mm6
-        movq    [ebx+40],mm7
+                  add        esi,48            //inc by 48 bytes processed
+                  add        ebx,48
+                  sub        ecx,8             //dec by 8 pixels processed
 
-        add             esi,48   //inc by 32 bytes processed
-        add             ebx,48
-        sub             ecx,8    //dec by 8 pixels processed
-
-        ja              mainloop48
+                  ja         mainloop48
 mainloop48end:
 
-        mov             ecx,diff
-        cmp             ecx,0
-        jz              end48
+                  mov        ecx,diff
+                  cmp        ecx,0
+                  jz         end48
 
-        mov             edx,mask
-        sal             edx,24   //make low byte the high byte
+                  mov        edx,mask
+                  sal        edx,24            //make low byte the high byte
 
 secondloop48:
-        sal             edx,1    //move high bit to CF
-        jnc             skip48   //if CF = 0
-        mov             eax,[esi]
-        mov             [ebx],eax
+                  sal        edx,1             //move high bit to CF
+                  jnc        skip48            //if CF = 0
+                  mov        eax,[esi]         //copies 4 bytes of the leftover pixel
+                  mov        [ebx],eax
 skip48:
-        add             esi,4
-        add             ebx,4
+                  add        esi,4             //NOTE(review): steps 4 bytes, but main loop
+                  add        ebx,4             //uses 48 bytes per 8 pixels (6 each) - confirm
 
-        dec             ecx
-        jnz secondloop48
+                  dec        ecx
+                  jnz        secondloop48
 
 end48:
-        emms
-        }
-        }
-        else /* mmx _not supported - Use modified C routine */
-        {
-          register unsigned int incr1, initial_val, final_val;
-          png_size_t pixel_bytes;
-          png_uint_32 i;
-          register int disp = png_pass_inc[png_ptr->pass];
-          int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
-          pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-          srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-             pixel_bytes;
-          dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-          initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-          final_val = png_ptr->width*pixel_bytes;
-          incr1 = (disp)*pixel_bytes;
-          for (i = initial_val; i < final_val; i += incr1)
-          {
-          png_memcpy(dstptr, srcptr, pixel_bytes);
-          srcptr += incr1;
-          dstptr += incr1;
-          }
-        } /* end of else */
-      break;  // end 48 bpp
-      }
-  default:
-      {
-      png_bytep sptr;
-      png_bytep dp;
-      png_size_t pixel_bytes;
-      int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
-      unsigned int i;
-      register int disp = png_pass_inc[png_ptr->pass];   // get the offset
-      register unsigned int incr1, initial_val, final_val;
-      pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-      sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*pixel_bytes;
-      dp = row + offset_table[png_ptr->pass]*pixel_bytes;
-      initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-      final_val = png_ptr->width*pixel_bytes;
-      incr1 = (disp)*pixel_bytes;
-      for (i = initial_val; i < final_val; i += incr1)
-      {
-        png_memcpy(dp, sptr, pixel_bytes);
-        sptr += incr1;
-        dp += incr1;
-      }
+                  emms
+               }
+            }
+            else /* mmx not supported - use modified C routine */
+            {
+               register unsigned int incr1, initial_val, final_val;
+               png_size_t pixel_bytes;
+               png_uint_32 i;
+               register int disp = png_pass_inc[png_ptr->pass];
+               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+                  pixel_bytes;
+               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+               final_val = png_ptr->width*pixel_bytes;
+               incr1 = (disp)*pixel_bytes;
+               for (i = initial_val; i < final_val; i += incr1)
+               {
+                  png_memcpy(dstptr, srcptr, pixel_bytes);
+                  srcptr += incr1;
+                  dstptr += incr1;
+               }
+            } /* end of else */
+            break;  // end 48 bpp
+         }
 
-      break;
-      }
-    }
-  }
-}
+         default:
+         {
+            png_bytep sptr;
+            png_bytep dp;
+            png_size_t pixel_bytes;
+            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+            unsigned int i;
+            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
+            register unsigned int incr1, initial_val, final_val;
+            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+               pixel_bytes;
+            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
+            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+            final_val = png_ptr->width*pixel_bytes;
+            incr1 = (disp)*pixel_bytes;
+            for (i = initial_val; i < final_val; i += incr1)
+            {
+               png_memcpy(dp, sptr, pixel_bytes);
+               sptr += incr1;
+               dp += incr1;
+            }
+            break;
+         }
+      } /* end switch (png_ptr->row_info.pixel_depth) */
+   }
+   mmx_supported = save_mmx_supported;
+
+} /* end png_combine_row() */
 
 
 #if defined(PNG_READ_INTERLACING_SUPPORTED)
@@ -946,9 +945,11 @@
    png_uint_32 transformations)
 {
 
+   int save_mmx_supported = mmx_supported;
    png_debug(1,"in png_do_read_interlace\n");
-   if (mmx_supported==2)
-       mmx_supported=mmxsupport();
+
+   // mmx_supported = mmxsupport();  // doesn't work
+   mmx_supported = 0;
 
    if (row != NULL && row_info != NULL)
    {
@@ -958,1068 +959,931 @@
 
       switch (row_info->pixel_depth)
       {
-   case 1:
-   {
-      png_bytep sp, dp;
-      int sshift, dshift;
-      int s_start, s_end, s_inc;
-      png_byte v;
-      png_uint_32 i;
-      int j;
+         case 1:
+         {
+            png_bytep sp, dp;
+            int sshift, dshift;
+            int s_start, s_end, s_inc;
+            png_byte v;
+            png_uint_32 i;
+            int j;
 
-      sp = row + (png_size_t)((row_info->width - 1) >> 3);
-      dp = row + (png_size_t)((final_width - 1) >> 3);
+            sp = row + (png_size_t)((row_info->width - 1) >> 3);
+            dp = row + (png_size_t)((final_width - 1) >> 3);
 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
-      if (transformations & PNG_PACKSWAP)
-      {
-    sshift = (int)((row_info->width + 7) & 7);
-    dshift = (int)((final_width + 7) & 7);
-    s_start = 7;
-    s_end = 0;
-    s_inc = -1;
-      }
-      else
+            if (transformations & PNG_PACKSWAP)
+            {
+               sshift = (int)((row_info->width + 7) & 7);
+               dshift = (int)((final_width + 7) & 7);
+               s_start = 7;
+               s_end = 0;
+               s_inc = -1;
+            }
+            else
 #endif
-      {
-    sshift = 7 - (int)((row_info->width + 7) & 7);
-    dshift = 7 - (int)((final_width + 7) & 7);
-    s_start = 0;
-    s_end = 7;
-    s_inc = 1;
-      }
-
-      for (i = row_info->width; i; i--)
-      {
-         v = (png_byte)((*sp >> sshift) & 0x1);
-         for (j = 0; j < png_pass_inc[pass]; j++)
-         {
-      *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
-      *dp |= (png_byte)(v << dshift);
-      if (dshift == s_end)
-      {
-         dshift = s_start;
-         dp--;
-      }
-      else
-         dshift += s_inc;
-         }
-         if (sshift == s_end)
-         {
-      sshift = s_start;
-      sp--;
-         }
-         else
-      sshift += s_inc;
-      }
-      break;
-   }
-   case 2:
-   {
-      png_bytep sp, dp;
-      int sshift, dshift;
-      int s_start, s_end, s_inc;
-      png_uint_32 i;
-
-      sp = row + (png_size_t)((row_info->width - 1) >> 2);
-      dp = row + (png_size_t)((final_width - 1) >> 2);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-      if (transformations & PNG_PACKSWAP)
-      {
-         sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
-         dshift = (png_size_t)(((final_width + 3) & 3) << 1);
-         s_start = 6;
-         s_end = 0;
-         s_inc = -2;
-      }
-      else
-#endif
-      {
-         sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
-         dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
-         s_start = 0;
-         s_end = 6;
-         s_inc = 2;
-      }
-
-      for (i = row_info->width; i; i--)
-      {
-         png_byte v;
-         int j;
-
-         v = (png_byte)((*sp >> sshift) & 0x3);
-         for (j = 0; j < png_pass_inc[pass]; j++)
-         {
-      *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
-      *dp |= (png_byte)(v << dshift);
-      if (dshift == s_end)
-      {
-         dshift = s_start;
-         dp--;
-      }
-      else
-         dshift += s_inc;
-         }
-         if (sshift == s_end)
-         {
-      sshift = s_start;
-      sp--;
-         }
-         else
-      sshift += s_inc;
-      }
-      break;
-   }
-   case 4:
-   {
-      png_bytep sp, dp;
-      int sshift, dshift;
-      int s_start, s_end, s_inc;
-      png_uint_32 i;
-
-      sp = row + (png_size_t)((row_info->width - 1) >> 1);
-      dp = row + (png_size_t)((final_width - 1) >> 1);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-      if (transformations & PNG_PACKSWAP)
-      {
-         sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
-         dshift = (png_size_t)(((final_width + 1) & 1) << 2);
-         s_start = 4;
-         s_end = 0;
-         s_inc = -4;
-      }
-      else
-#endif
-      {
-         sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
-         dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
-         s_start = 0;
-         s_end = 4;
-         s_inc = 4;
-      }
-
-      for (i = row_info->width; i; i--)
-      {
-         png_byte v;
-         int j;
-
-         v = (png_byte)((*sp >> sshift) & 0xf);
-         for (j = 0; j < png_pass_inc[pass]; j++)
-         {
-      *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
-      *dp |= (png_byte)(v << dshift);
-      if (dshift == s_end)
-      {
-         dshift = s_start;
-         dp--;
-      }
-      else
-         dshift += s_inc;
-         }
-         if (sshift == s_end)
-         {
-      sshift = s_start;
-      sp--;
-         }
-         else
-      sshift += s_inc;
-      }
-      break;
-   }
-   default:         // This is the place where the routine is modified
-   {
-      __int64 const4 = 0x0000000000FFFFFF;
-      __int64 const5 = 0x000000FFFFFF0000;
-      __int64 const6 = 0x00000000000000FF;
-      //int mmx_supported = 1;
-
-      png_bytep sptr, dp;
-      png_uint_32 i;
-      png_size_t pixel_bytes;
-
-      int width = row_info->width;
-
-      pixel_bytes = (row_info->pixel_depth >> 3);
-
-      sptr = row + (row_info->width - 1) * pixel_bytes;
-      dp = row + (final_width - 1) * pixel_bytes;
-      // New code by Nirav Chhatrapati - Intel Corporation
-
-      if (mmx_supported) // If machine supports MMX technology use MMX routine
-      {
-      if (pixel_bytes == 3)
-      {
-        if ((pass == 0) || (pass == 1))
-        {
-          _asm
-          {
-            mov esi, sptr
-
-            mov edi, dp
-
-            mov ecx, width
-
-            sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
-
-loop_pass0:
-
-            movd mm0, [esi]     ; X X X X X val2 val1 val0
-
-            pand mm0, const4    ; 0 0 0 0 0 val2 val1 val0
-
-            movq mm1, mm0       ; 0 0 0 0 0 val2 val1 val0
-
-            psllq mm0, 16       ; 0 0 0 val2 val1 val0 0 0
-
-            movq mm2, mm0       ; 0 0 0 val2 val1 val0 0 0
-
-            psllq mm0, 24       ; val2 val1 val0 0 0 0 0 0
-
-            psrlq mm1, 8        ; 0 0 0 0 0 0 val2 val1
-
-            por mm0, mm2        ; val2 val1 val0 val2 val1 val0 0 0
-
-            por mm0, mm1        ; val2 val1 val0 val2 val1 val0 val2 val1
-
-            movq mm3, mm0       ; val2 val1 val0 val2 val1 val0 val2 val1
-
-            psllq mm0, 16       ; val0 val2 val1 val0 val2 val1 0 0
-
-            movq mm4, mm3       ; val2 val1 val0 val2 val1 val0 val2 val1
-
-            punpckhdq mm3, mm0  ; val0 val2 val1 val0 val2 val1 val0 val2
-
-            movq [edi+16] , mm4
-
-            psrlq mm0, 32       ; 0 0 0 0 val0 val2 val1 val0
-
-            movq [edi+8] , mm3
-
-            punpckldq mm0, mm4  ; val1 val0 val2 val1 val0 val2 val1 val0
-
-            sub esi, 3
-
-            movq [edi], mm0
-
-            sub edi, 24
-
-            //sub esi, 3
-
-            dec ecx
-
-            jnz loop_pass0
-
-            EMMS
-          }
-
-        }
-
-        else if ((pass == 2) || (pass == 3))
-        {
-          _asm
-          {
-            mov esi, sptr
-
-            mov edi, dp
-
-            mov ecx, width
-
-            sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
-
-loop_pass2:
-
-            movd mm0, [esi]     ; X X X X X val2 val1 val0
-
-            pand mm0, const4    ; 0 0 0 0 0 val2 val1 val0
-
-            movq mm1, mm0       ; 0 0 0 0 0 val2 val1 val0
-
-            psllq mm0, 16       ; 0 0 0 val2 val1 val0 0 0
-
-            movq mm2, mm0       ; 0 0 0 val2 val1 val0 0 0
-
-            psllq mm0, 24       ; val2 val1 val0 0 0 0 0 0
-
-            psrlq mm1, 8        ; 0 0 0 0 0 0 val2 val1
-
-            por mm0, mm2        ; val2 val1 val0 val2 val1 val0 0 0
-
-            por mm0, mm1        ; val2 val1 val0 val2 val1 val0 val2 val1
-
-            movq [edi+4], mm0   ; move to memory
-
-            psrlq mm0, 16       ; 0 0 val2 val1 val0 val2 val1 val0
-
-            movd [edi], mm0     ; move to memory
-
-            sub esi, 3
-
-            sub edi, 12
-
-            dec ecx
-
-            jnz loop_pass2
-
-            EMMS
-          }
-        }
-
-        else /*if ((pass == 4) || (pass == 5)) */
-        {
-
-          int width_mmx = ((width >> 1) << 1) - 8;
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-            mov esi, sptr
-
-            mov edi, dp
-
-            mov ecx, width_mmx
-
-            sub esi, 3
-
-            sub edi, 9
-
-loop_pass4:
-
-            movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
-
-            movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
-
-            movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
-
-            psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
-
-            pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
-
-            psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
-
-            por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
-
-            movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
-
-            psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
-
-            movq [edi], mm0     ; move quad to memory
-
-            psrlq mm5, 16       ; 0 0 0 0 0 X X v2
-
-            pand mm5, const6    ; 0 0 0 0 0 0 0 v2
-
-            por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
-
-            movd [edi+8], mm6   ; move double to memory
-
-            sub esi, 6
-
-            sub edi, 12
-
-            sub ecx, 2
-
-            jnz loop_pass4
-
-            EMMS
-          }
-
-          sptr -= width_mmx*3;
-          dp -= width_mmx*6;
-          for (i = width; i; i--)
-          {
-            png_byte v[8];
-            int j;
-
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
             {
-              png_memcpy(dp, v, pixel_bytes);
-              dp -= pixel_bytes;
+               sshift = 7 - (int)((row_info->width + 7) & 7);
+               dshift = 7 - (int)((final_width + 7) & 7);
+               s_start = 0;
+               s_end = 7;
+               s_inc = 1;
             }
-           sptr -= pixel_bytes;
-           }
 
-        }
-
-      }  /* end of pixel_bytes == 3 */
-
-      else if (pixel_bytes == 1)
-      {
-
-        if ((pass == 0) || (pass == 1))
-        {
-          int width_mmx = ((width >> 2) << 2);
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-
-            mov esi, sptr
-
-            mov edi, dp
-
-            mov ecx, width_mmx
-
-            sub edi, 31
-
-            sub esi, 3
-
-loop1_pass0:
-
-            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
-
-            movq mm1, mm0       ; X X X X v0 v1 v2 v3
-
-            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
-
-            movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
-
-            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
-
-            movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
-
-            punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
-
-            punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
-
-            movq [edi], mm0     ; move to memory v3
-
-            punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
-
-            movq [edi+8], mm3   ; move to memory v2
-
-            movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
-
-            punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
-
-            punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
-
-            movq [edi+16], mm2  ; move to memory v1
-
-            movq [edi+24], mm4  ; move to memory v0
-
-            sub esi, 4
-
-            sub edi, 32
-
-            sub ecx, 4
-
-            jnz loop1_pass0
-
-            EMMS
-          }
-
-          sptr -= width_mmx;
-          dp -= width_mmx*8;
-          for (i = width; i; i--)
-          {
-            png_byte v[8];
-            int j;
-
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
+            for (i = row_info->width; i; i--)
             {
-              png_memcpy(dp, v, pixel_bytes);
-              dp -= pixel_bytes;
-            }
-           sptr -= pixel_bytes;
-          }
-
-        }
-
-
-        else if ((pass == 2) || (pass == 3))
-        {
-          int width_mmx = ((width >> 2) << 2);
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-
-            mov esi, sptr
-
-            mov edi, dp
-
-            mov ecx, width_mmx
-
-            sub edi, 15
-
-            sub esi, 3
-
-loop1_pass2:
-
-            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
-
-            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
-
-            movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
-
-            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
-
-            punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
-
-            movq [edi], mm0     ; move to memory v2 and v3
-
-            sub esi, 4
-
-            movq [edi+8], mm1   ; move to memory v1     and v0
-
-            sub edi, 16
-
-            sub ecx, 4
-
-            jnz loop1_pass2
-
-            EMMS
-          }
-
-          sptr -= width_mmx;
-          dp -= width_mmx*4;
-          for (i = width; i; i--)
-          {
-            png_byte v[8];
-            int j;
-
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-              png_memcpy(dp, v, pixel_bytes);
-              dp -= pixel_bytes;
-            }
-           sptr -= pixel_bytes;
-          }
-
-        }
-
-        else //if ((pass == 4) || (pass == 5))
-        {
-          int width_mmx = ((width >> 3) << 3);
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-
-            mov esi, sptr
-            mov edi, dp
-            mov ecx, width_mmx
-            sub edi, 15
-            sub esi, 7
-
-loop1_pass4:
-
-            movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
-            movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
-            punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
-            //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
-            punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
-            movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
-            sub esi, 8
-            movq [edi], mm0     ; move to memory v4 v5 v6 and v7
-            //sub esi, 4
-            sub edi, 16
-            sub ecx, 8
-            jnz loop1_pass4
-
-            EMMS
-          }
-
-          sptr -= width_mmx;
-          dp -= width_mmx*2;
-          for (i = width; i; i--)
-          {
-            png_byte v[8];
-            int j;
-
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-              png_memcpy(dp, v, pixel_bytes);
-              dp -= pixel_bytes;
-            }
-           sptr -= pixel_bytes;
-          }
-
-        }
-
-      }       /* end of pixel_bytes == 1 */
-
-      else if (pixel_bytes == 2)
-      {
-
-        if ((pass == 0) || (pass == 1))
-        {
-          int width_mmx = ((width >> 1) << 1);
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-            mov esi, sptr
-            mov edi, dp
-            mov ecx, width_mmx
-            sub esi, 2
-            sub edi, 30
-
-loop2_pass0:
-            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
-            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
-            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
-            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
-            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
-            movq [edi], mm0
-            movq [edi + 8], mm0
-            movq [edi + 16], mm1
-            movq [edi + 24], mm1
-            sub esi, 4
-            sub edi, 32
-            sub ecx, 2
-            jnz loop2_pass0
-
-            EMMS
-          }
-
-          sptr -= (width_mmx*2 + 2);
-          dp -= (width_mmx*16 + 2);
-
-          for (i = width; i; i--)
-          {
-
-            png_byte v[8];
-            int j;
-            sptr -= pixel_bytes;
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-              dp -= pixel_bytes;
-              png_memcpy(dp, v, pixel_bytes);
-              //dp -= pixel_bytes;
-            }
-            //sptr -= pixel_bytes;
-          }
-        }
-
-        else if ((pass == 2) || (pass == 3))
-        {
-          int width_mmx = ((width >> 1) << 1) ;
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-            mov esi, sptr
-            mov edi, dp
-            mov ecx, width_mmx
-            sub esi, 2
-            sub edi, 14
-
-loop2_pass2:
-            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
-            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
-            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
-            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
-            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
-            movq [edi], mm0
-            sub esi, 4
-            movq [edi + 8], mm1
-            //sub esi, 4
-            sub edi, 16
-            sub ecx, 2
-            jnz loop2_pass2
-
-            EMMS
-          }
-
-          sptr -= (width_mmx*2 + 2);
-          dp -= (width_mmx*8 + 2);
-
-          for (i = width; i; i--)
-          {
-
-            png_byte v[8];
-            int j;
-            sptr -= pixel_bytes;
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-              dp -= pixel_bytes;
-              png_memcpy(dp, v, pixel_bytes);
-              //dp -= pixel_bytes;
-            }
-            //sptr -= pixel_bytes;
-          }
-        }
-
-        else // pass == 4 or 5
-        {
-          int width_mmx = ((width >> 1) << 1) ;
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-            mov esi, sptr
-            mov edi, dp
-            mov ecx, width_mmx
-            sub esi, 2
-            sub edi, 6
-
-loop2_pass4:
-            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
-            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
-            sub esi, 4
-            movq [edi], mm0
-            sub edi, 8
-            sub ecx, 2
-            jnz loop2_pass4
-
-            EMMS
-          }
-
-          sptr -= (width_mmx*2 + 2);
-          dp -= (width_mmx*4 + 2);
-
-          for (i = width; i; i--)
-          {
-
-            png_byte v[8];
-            int j;
-            sptr -= pixel_bytes;
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-              dp -= pixel_bytes;
-              png_memcpy(dp, v, pixel_bytes);
-              //dp -= pixel_bytes;
-            }
-            //sptr -= pixel_bytes;
-          }
-        }
-
-      } /* end of pixel_bytes == 2 */
-
-      else if (pixel_bytes == 4)
-      {
-        if ((pass == 0) || (pass == 1))
-        {
-          int width_mmx = ((width >> 1) << 1) ;
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-            mov esi, sptr
-            mov edi, dp
-            mov ecx, width_mmx
-            sub esi, 4
-            sub edi, 60
-
-loop4_pass0:
-            movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
-            movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
-            punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
-            punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
-            movq [edi], mm0
-            movq [edi + 8], mm0
-            movq [edi + 16], mm0
-            movq [edi + 24], mm0
-            movq [edi+32], mm1
-            movq [edi + 40], mm1
-            movq [edi+ 48], mm1
-            sub esi, 8
-            movq [edi + 56], mm1
-            sub edi, 64
-            sub ecx, 2
-            jnz loop4_pass0
-
-            EMMS
-          }
-
-          sptr -= (width_mmx*4 + 4);
-          dp -= (width_mmx*32 + 4);
-
-          for (i = width; i; i--)
-          {
-
-            png_byte v[8];
-            int j;
-            sptr -= pixel_bytes;
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-              dp -= pixel_bytes;
-              png_memcpy(dp, v, pixel_bytes);
-              //dp -= pixel_bytes;
-            }
-            //sptr -= pixel_bytes;
-          }
-        }
-
-        else if ((pass == 2) || (pass == 3))
-        {
-          int width_mmx = ((width >> 1) << 1) ;
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-            mov esi, sptr
-            mov edi, dp
-            mov ecx, width_mmx
-            sub esi, 4
-            sub edi, 28
-
-loop4_pass2:
-            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
-            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
-            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
-            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
-            movq [edi], mm0
-            movq [edi + 8], mm0
-            movq [edi+16], mm1
-            movq [edi + 24], mm1
-            sub esi, 8
-            sub edi, 32
-            sub ecx, 2
-            jnz loop4_pass2
-
-            EMMS
-          }
-
-          sptr -= (width_mmx*4 + 4);
-          dp -= (width_mmx*16 + 4);
-
-          for (i = width; i; i--)
-          {
-
-            png_byte v[8];
-            int j;
-            sptr -= pixel_bytes;
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-              dp -= pixel_bytes;
-              png_memcpy(dp, v, pixel_bytes);
-              //dp -= pixel_bytes;
-            }
-            //sptr -= pixel_bytes;
-          }
-        }
-
-        else // pass == 4 or 5
-        {
-          int width_mmx = ((width >> 1) << 1) ;
-          width -= width_mmx;
-               if(width_mmx)
-          _asm
-          {
-            mov esi, sptr
-            mov edi, dp
-            mov ecx, width_mmx
-            sub esi, 4
-            sub edi, 12
-
-loop4_pass4:
-            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
-            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
-            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
-            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
-            movq [edi], mm0
-            sub esi, 8
-            movq [edi + 8], mm1
-            sub edi, 16
-            sub ecx, 2
-            jnz loop4_pass4
-
-            EMMS
-          }
-
-          sptr -= (width_mmx*4 + 4);
-          dp -= (width_mmx*8 + 4);
-
-          for (i = width; i; i--)
-          {
-
-            png_byte v[8];
-            int j;
-            sptr -= pixel_bytes;
-            png_memcpy(v, sptr, pixel_bytes);
-            for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-              dp -= pixel_bytes;
-              png_memcpy(dp, v, pixel_bytes);
-              //dp -= pixel_bytes;
-            }
-            //sptr -= pixel_bytes;
-          }
-        }
-
-      } /* end of pixel_bytes == 4 */
-
-      else if (pixel_bytes == 6)
-      {
-        for (i = row_info->width; i; i--)
-        {
-
-          png_byte v[8];
-          int j;
-          png_memcpy(v, sptr, pixel_bytes);
-          for (j = 0; j < png_pass_inc[pass]; j++)
-          {
-            png_memcpy(dp, v, pixel_bytes);
-            dp -= pixel_bytes;
-          }
-          sptr -= pixel_bytes;
-        }
-      } /* end of pixel_bytes == 6 */
-
-      else
-      {
-      for (i = row_info->width; i; i--)
-        {
-
-          png_byte v[8];
-          int j;
-          png_memcpy(v, sptr, pixel_bytes);
-          for (j = 0; j < png_pass_inc[pass]; j++)
-          {
-            png_memcpy(dp, v, pixel_bytes);
-            dp -= pixel_bytes;
-          }
-          sptr-= pixel_bytes;
-        }
-      }
-      }       /* end of mmx_supported */
-
-      else   /* MMX not supported */
-      /* use modified C code - takes advantage of inlining of memcpy for
-         a constant */
-      {
-        if (pixel_bytes == 1)
-        {
-        for (i = row_info->width; i; i--)
-          {
-          png_byte v[8];
-          int j;
-
-          png_memcpy(v, sptr, pixel_bytes);
-          for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-            png_memcpy(dp, v, pixel_bytes);
-            dp -= pixel_bytes;
-            }
-          sptr -= pixel_bytes;
-          }
-        }
-        else if (pixel_bytes == 3)
-        {
-        for (i = row_info->width; i; i--)
-        {
-          png_byte v[8];
-          int j;
-      png_memcpy(v, sptr, pixel_bytes);
-         for (j = 0; j < png_pass_inc[pass]; j++)
-            {
-            png_memcpy(dp, v, pixel_bytes);
-            dp -= pixel_bytes;
-            }
-          sptr -= pixel_bytes;
-          }
-        }
-        else if (pixel_bytes == 2)
-        {
-        for (i = row_info->width; i; i--)
-          {
-          png_byte v[8];
-          int j;
-      png_memcpy(v, sptr, pixel_bytes);
-        for (j = 0; j < png_pass_inc[pass]; j++)
-             {
-            png_memcpy(dp, v, pixel_bytes);
-            dp -= pixel_bytes;
-            }
-          sptr -= pixel_bytes;
-          }
-        }
-        else if (pixel_bytes == 4)
-        {
-        for (i = row_info->width; i; i--)
-          {
-          png_byte v[8];
-          int j;
-      png_memcpy(v, sptr, pixel_bytes);
-        for (j = 0; j < png_pass_inc[pass]; j++)
-             {
-            png_memcpy(dp, v, pixel_bytes);
-            dp -= pixel_bytes;
-            }
-          sptr -= pixel_bytes;
-          }
-        }
-        else if (pixel_bytes == 6)
-        {
-        for (i = row_info->width; i; i--)
-          {
-          png_byte v[8];
-          int j;
-      png_memcpy(v, sptr, pixel_bytes);
-        for (j = 0; j < png_pass_inc[pass]; j++)
-             {
-            png_memcpy(dp, v, pixel_bytes);
-            dp -= pixel_bytes;
-            }
-          sptr -= pixel_bytes;
-          }
-        }
-        else
-        {
-        for (i = row_info->width; i; i--)
-          {
-          png_byte v[8];
-          int j;
-           png_memcpy(v, sptr, pixel_bytes);
-         for (j = 0; j < png_pass_inc[pass]; j++)
+               v = (png_byte)((*sp >> sshift) & 0x1);
+               for (j = 0; j < png_pass_inc[pass]; j++)
                {
-              png_memcpy(dp, v, pixel_bytes);
-            dp -= pixel_bytes;
+                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
+                  *dp |= (png_byte)(v << dshift);
+                  if (dshift == s_end)
+                  {
+                     dshift = s_start;
+                     dp--;
+                  }
+                  else
+                     dshift += s_inc;
+               }
+               if (sshift == s_end)
+               {
+                  sshift = s_start;
+                  sp--;
+               }
+               else
+                  sshift += s_inc;
             }
-          sptr -= pixel_bytes;
-          }
-        }
+            break;
+         }
 
-      } /* end of MMX not supported */
-      break;
-   }
-      }
-    row_info->width = final_width;
+         case 2:
+         {
+            png_bytep sp, dp;
+            int sshift, dshift;
+            int s_start, s_end, s_inc;
+            png_uint_32 i;
+
+            sp = row + (png_size_t)((row_info->width - 1) >> 2);
+            dp = row + (png_size_t)((final_width - 1) >> 2);
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+            if (transformations & PNG_PACKSWAP)
+            {
+               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
+               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
+               s_start = 6;
+               s_end = 0;
+               s_inc = -2;
+            }
+            else
+#endif
+            {
+               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
+               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
+               s_start = 0;
+               s_end = 6;
+               s_inc = 2;
+            }
+
+            for (i = row_info->width; i; i--)
+            {
+               png_byte v;
+               int j;
+
+               v = (png_byte)((*sp >> sshift) & 0x3);
+               for (j = 0; j < png_pass_inc[pass]; j++)
+               {
+                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
+                  *dp |= (png_byte)(v << dshift);
+                  if (dshift == s_end)
+                  {
+                     dshift = s_start;
+                     dp--;
+                  }
+                  else
+                     dshift += s_inc;
+               }
+               if (sshift == s_end)
+               {
+                  sshift = s_start;
+                  sp--;
+               }
+               else
+                  sshift += s_inc;
+            }
+            break;
+         }
+
+         case 4:
+         {
+            png_bytep sp, dp;
+            int sshift, dshift;
+            int s_start, s_end, s_inc;
+            png_uint_32 i;
+
+            sp = row + (png_size_t)((row_info->width - 1) >> 1);
+            dp = row + (png_size_t)((final_width - 1) >> 1);
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+            if (transformations & PNG_PACKSWAP)
+            {
+               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
+               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
+               s_start = 4;
+               s_end = 0;
+               s_inc = -4;
+            }
+            else
+#endif
+            {
+               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
+               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
+               s_start = 0;
+               s_end = 4;
+               s_inc = 4;
+            }
+
+            for (i = row_info->width; i; i--)
+            {
+               png_byte v;
+               int j;
+
+               v = (png_byte)((*sp >> sshift) & 0xf);
+               for (j = 0; j < png_pass_inc[pass]; j++)
+               {
+                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
+                  *dp |= (png_byte)(v << dshift);
+                  if (dshift == s_end)
+                  {
+                     dshift = s_start;
+                     dp--;
+                  }
+                  else
+                     dshift += s_inc;
+               }
+               if (sshift == s_end)
+               {
+                  sshift = s_start;
+                  sp--;
+               }
+               else
+                  sshift += s_inc;
+            }
+            break;
+         }
+
+         default:         // This is the place where the routine is modified
+         {
+            __int64 const4 = 0x0000000000FFFFFF;
+            __int64 const5 = 0x000000FFFFFF0000;
+            __int64 const6 = 0x00000000000000FF;
+            //int mmx_supported = 1;
+
+            png_bytep sptr, dp;
+            png_uint_32 i;
+            png_size_t pixel_bytes;
+
+            int width = row_info->width;
+
+            pixel_bytes = (row_info->pixel_depth >> 3);
+
+            sptr = row + (row_info->width - 1) * pixel_bytes;
+            dp = row + (final_width - 1) * pixel_bytes;
+            // New code by Nirav Chhatrapati - Intel Corporation
+
+            if (mmx_supported) // use MMX routine if machine supports it
+            {
+               if (pixel_bytes == 3)
+               {
+                  if ((pass == 0) || (pass == 1))
+                  {
+                     _asm
+                     {
+                        mov esi, sptr
+                        mov edi, dp
+                        mov ecx, width
+                        sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
+loop_pass0:
+                        movd mm0, [esi]     ; X X X X X v2 v1 v0
+                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
+                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
+                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
+                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
+                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
+                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
+                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
+                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
+                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
+                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
+                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
+                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
+                        movq [edi+16] , mm4
+                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
+                        movq [edi+8] , mm3
+                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
+                        sub esi, 3
+                        movq [edi], mm0
+                        sub edi, 24
+                        //sub esi, 3
+                        dec ecx
+                        jnz loop_pass0
+                        EMMS
+                     }
+                  }
+                  else if ((pass == 2) || (pass == 3))
+                  {
+                     _asm
+                     {
+                        mov esi, sptr
+                        mov edi, dp
+                        mov ecx, width
+                        sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
+loop_pass2:
+                        movd mm0, [esi]     ; X X X X X v2 v1 v0
+                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
+                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
+                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
+                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
+                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
+                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
+                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
+                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
+                        movq [edi+4], mm0   ; move to memory
+                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
+                        movd [edi], mm0     ; move to memory
+                        sub esi, 3
+                        sub edi, 12
+                        dec ecx
+                        jnz loop_pass2
+                        EMMS
+                     }
+                  }
+                  else /* if ((pass == 4) || (pass == 5)) */
+                  {
+                     // The MMX loop expands two pixels per iteration and is
+                     // given every pixel except the 8 (or 9) leftmost ones,
+                     // which are left to the C loop further down.  For rows
+                     // narrower than 8 pixels the subtraction goes negative:
+                     // clamp it to zero, otherwise the "if (width_mmx)" test
+                     // still passes and the asm loop counter (decremented by
+                     // 2 until it reaches zero) starts below zero and the
+                     // loop runs away through memory.
+                     int width_mmx = ((width >> 1) << 1) - 8;
+                     if (width_mmx < 0)
+                        width_mmx = 0;
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub esi, 3
+                           sub edi, 9
+loop_pass4:
+                           movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
+                           movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
+                           movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
+                           psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
+                           pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
+                           psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
+                           por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
+                           movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
+                           psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
+                           movq [edi], mm0     ; move quad to memory
+                           psrlq mm5, 16       ; 0 0 0 0 0 X X v2
+                           pand mm5, const6    ; 0 0 0 0 0 0 0 v2
+                           por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
+                           movd [edi+8], mm6   ; move double to memory
+                           sub esi, 6
+                           sub edi, 12
+                           sub ecx, 2
+                           jnz loop_pass4
+                           EMMS
+                        }
+                     }
+
+                     sptr -= width_mmx*3;
+                     dp -= width_mmx*6;
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           png_memcpy(dp, v, pixel_bytes);
+                           dp -= pixel_bytes;
+                        }
+                        sptr -= pixel_bytes;
+                     }
+                  }
+               } /* end of pixel_bytes == 3 */
+
+               else if (pixel_bytes == 1)
+               {
+                  if ((pass == 0) || (pass == 1))
+                  {
+                     int width_mmx = ((width >> 2) << 2);
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub edi, 31
+                           sub esi, 3
+loop1_pass0:
+                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
+                           movq mm1, mm0       ; X X X X v0 v1 v2 v3
+                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
+                           movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
+                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
+                           movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
+                           punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
+                           punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
+                           movq [edi], mm0     ; move to memory v3
+                           punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
+                           movq [edi+8], mm3   ; move to memory v2
+                           movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
+                           punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
+                           punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
+                           movq [edi+16], mm2  ; move to memory v1
+                           movq [edi+24], mm4  ; move to memory v0
+                           sub esi, 4
+                           sub edi, 32
+                           sub ecx, 4
+                           jnz loop1_pass0
+                           EMMS
+                        }
+                     }
+
+                     sptr -= width_mmx;
+                     dp -= width_mmx*8;
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           png_memcpy(dp, v, pixel_bytes);
+                           dp -= pixel_bytes;
+                        }
+                        sptr -= pixel_bytes;
+                     }
+                  }
+                  else if ((pass == 2) || (pass == 3))
+                  {
+                     int width_mmx = ((width >> 2) << 2);
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub edi, 15
+                           sub esi, 3
+loop1_pass2:
+                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
+                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
+                           movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
+                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
+                           punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
+                           movq [edi], mm0     ; move to memory v2 and v3
+                           sub esi, 4
+                           movq [edi+8], mm1   ; move to memory v1     and v0
+                           sub edi, 16
+                           sub ecx, 4
+                           jnz loop1_pass2
+                           EMMS
+                        }
+                     }
+
+                     sptr -= width_mmx;
+                     dp -= width_mmx*4;
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           png_memcpy(dp, v, pixel_bytes);
+                           dp -= pixel_bytes;
+                        }
+                        sptr -= pixel_bytes;
+                     }
+                  }
+                  else //if ((pass == 4) || (pass == 5))
+                  {
+                     int width_mmx = ((width >> 3) << 3);
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub edi, 15
+                           sub esi, 7
+loop1_pass4:
+                           movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
+                           movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
+                           punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
+                           //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
+                           punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
+                           movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
+                           sub esi, 8
+                           movq [edi], mm0     ; move to memory v4 v5 v6 and v7
+                           //sub esi, 4
+                           sub edi, 16
+                           sub ecx, 8
+                           jnz loop1_pass4
+                           EMMS
+                        }
+                     }
+
+                     sptr -= width_mmx;
+                     dp -= width_mmx*2;
+                     for (i = width; i; i--)
+                     {
+                        png_byte v[8];
+                        int j;
+
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           png_memcpy(dp, v, pixel_bytes);
+                           dp -= pixel_bytes;
+                        }
+                        sptr -= pixel_bytes;
+                     }
+                  }
+               } /* end of pixel_bytes == 1 */
+
+               else if (pixel_bytes == 2)
+               {
+                  if ((pass == 0) || (pass == 1))
+                  {
+                     int width_mmx = ((width >> 1) << 1);
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub esi, 2
+                           sub edi, 30
+loop2_pass0:
+                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
+                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
+                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
+                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
+                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
+                           movq [edi], mm0
+                           movq [edi + 8], mm0
+                           movq [edi + 16], mm1
+                           movq [edi + 24], mm1
+                           sub esi, 4
+                           sub edi, 32
+                           sub ecx, 2
+                           jnz loop2_pass0
+                           EMMS
+                        }
+                     }
+
+                     // The MMX loop consumed width_mmx source pixels (2 bytes
+                     // each) and wrote 8 copies of every one (16 bytes per
+                     // pixel).  The loop below decrements sptr/dp BEFORE each
+                     // copy, so both pointers must be parked one pixel above
+                     // the next slot to process: hence "- 2", not "+ 2"
+                     // (which skipped two pixels and ran off the row start).
+                     sptr -= (width_mmx*2 - 2);
+                     dp -= (width_mmx*16 - 2);
+                     for (i = width; i; i--)   // pixels the MMX loop left over
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= pixel_bytes;   // step onto the pixel to copy
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= pixel_bytes;  // step onto the next output slot
+                           png_memcpy(dp, v, pixel_bytes);
+                        }
+                     }
+                  }
+
+                  else if ((pass == 2) || (pass == 3))
+                  {
+                     int width_mmx = ((width >> 1) << 1) ;
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub esi, 2
+                           sub edi, 14
+loop2_pass2:
+                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
+                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
+                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
+                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
+                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
+                           movq [edi], mm0
+                           sub esi, 4
+                           movq [edi + 8], mm1
+                           //sub esi, 4
+                           sub edi, 16
+                           sub ecx, 2
+                           jnz loop2_pass2
+                           EMMS
+                        }
+                     }
+
+                     // The MMX loop consumed width_mmx source pixels (2 bytes
+                     // each) and wrote 4 copies of every one (8 bytes per
+                     // pixel).  The loop below decrements sptr/dp BEFORE each
+                     // copy, so both pointers must be parked one pixel above
+                     // the next slot to process: hence "- 2", not "+ 2"
+                     // (which skipped two pixels and ran off the row start).
+                     sptr -= (width_mmx*2 - 2);
+                     dp -= (width_mmx*8 - 2);
+                     for (i = width; i; i--)   // pixels the MMX loop left over
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= pixel_bytes;   // step onto the pixel to copy
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= pixel_bytes;  // step onto the next output slot
+                           png_memcpy(dp, v, pixel_bytes);
+                        }
+                     }
+                  }
+
+                  else // pass == 4 or 5
+                  {
+                     int width_mmx = ((width >> 1) << 1) ;
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub esi, 2
+                           sub edi, 6
+loop2_pass4:
+                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
+                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
+                           sub esi, 4
+                           movq [edi], mm0
+                           sub edi, 8
+                           sub ecx, 2
+                           jnz loop2_pass4
+                           EMMS
+                        }
+                     }
+
+                     // The MMX loop consumed width_mmx source pixels (2 bytes
+                     // each) and wrote 2 copies of every one (4 bytes per
+                     // pixel).  The loop below decrements sptr/dp BEFORE each
+                     // copy, so both pointers must be parked one pixel above
+                     // the next slot to process: hence "- 2", not "+ 2"
+                     // (which skipped two pixels and ran off the row start).
+                     sptr -= (width_mmx*2 - 2);
+                     dp -= (width_mmx*4 - 2);
+                     for (i = width; i; i--)   // pixels the MMX loop left over
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= pixel_bytes;   // step onto the pixel to copy
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= pixel_bytes;  // step onto the next output slot
+                           png_memcpy(dp, v, pixel_bytes);
+                        }
+                     }
+                  }
+               } /* end of pixel_bytes == 2 */
+
+               else if (pixel_bytes == 4)
+               {
+                  if ((pass == 0) || (pass == 1))
+                  {
+                     int width_mmx = ((width >> 1) << 1) ;
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub esi, 4
+                           sub edi, 60
+loop4_pass0:
+                           movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
+                           movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
+                           punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
+                           punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
+                           movq [edi], mm0
+                           movq [edi + 8], mm0
+                           movq [edi + 16], mm0
+                           movq [edi + 24], mm0
+                           movq [edi+32], mm1
+                           movq [edi + 40], mm1
+                           movq [edi+ 48], mm1
+                           sub esi, 8
+                           movq [edi + 56], mm1
+                           sub edi, 64
+                           sub ecx, 2
+                           jnz loop4_pass0
+                           EMMS
+                        }
+                     }
+
+                     // The MMX loop consumed width_mmx source pixels (4 bytes
+                     // each) and wrote 8 copies of every one (32 bytes per
+                     // pixel).  The loop below decrements sptr/dp BEFORE each
+                     // copy, so both pointers must be parked one pixel above
+                     // the next slot to process: hence "- 4", not "+ 4"
+                     // (which skipped two pixels and ran off the row start).
+                     sptr -= (width_mmx*4 - 4);
+                     dp -= (width_mmx*32 - 4);
+                     for (i = width; i; i--)   // pixels the MMX loop left over
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= pixel_bytes;   // step onto the pixel to copy
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= pixel_bytes;  // step onto the next output slot
+                           png_memcpy(dp, v, pixel_bytes);
+                        }
+                     }
+                  }
+
+                  else if ((pass == 2) || (pass == 3))
+                  {
+                     int width_mmx = ((width >> 1) << 1) ;
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub esi, 4
+                           sub edi, 28
+loop4_pass2:
+                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
+                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
+                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
+                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
+                           movq [edi], mm0
+                           movq [edi + 8], mm0
+                           movq [edi+16], mm1
+                           movq [edi + 24], mm1
+                           sub esi, 8
+                           sub edi, 32
+                           sub ecx, 2
+                           jnz loop4_pass2
+                           EMMS
+                        }
+                     }
+
+                     // The MMX loop consumed width_mmx source pixels (4 bytes
+                     // each) and wrote 4 copies of every one (16 bytes per
+                     // pixel).  The loop below decrements sptr/dp BEFORE each
+                     // copy, so both pointers must be parked one pixel above
+                     // the next slot to process: hence "- 4", not "+ 4"
+                     // (which skipped two pixels and ran off the row start).
+                     sptr -= (width_mmx*4 - 4);
+                     dp -= (width_mmx*16 - 4);
+                     for (i = width; i; i--)   // pixels the MMX loop left over
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= pixel_bytes;   // step onto the pixel to copy
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= pixel_bytes;  // step onto the next output slot
+                           png_memcpy(dp, v, pixel_bytes);
+                        }
+                     }
+                  }
+
+                  else // pass == 4 or 5
+                  {
+                     int width_mmx = ((width >> 1) << 1) ;
+                     width -= width_mmx;
+                     if (width_mmx)
+                     {
+                        _asm
+                        {
+                           mov esi, sptr
+                           mov edi, dp
+                           mov ecx, width_mmx
+                           sub esi, 4
+                           sub edi, 12
+loop4_pass4:
+                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
+                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
+                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
+                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
+                           movq [edi], mm0
+                           sub esi, 8
+                           movq [edi + 8], mm1
+                           sub edi, 16
+                           sub ecx, 2
+                           jnz loop4_pass4
+                           EMMS
+                        }
+                     }
+
+                     // The MMX loop consumed width_mmx source pixels (4 bytes
+                     // each) and wrote 2 copies of every one (8 bytes per
+                     // pixel).  The loop below decrements sptr/dp BEFORE each
+                     // copy, so both pointers must be parked one pixel above
+                     // the next slot to process: hence "- 4", not "+ 4"
+                     // (which skipped two pixels and ran off the row start).
+                     sptr -= (width_mmx*4 - 4);
+                     dp -= (width_mmx*8 - 4);
+                     for (i = width; i; i--)   // pixels the MMX loop left over
+                     {
+                        png_byte v[8];
+                        int j;
+                        sptr -= pixel_bytes;   // step onto the pixel to copy
+                        png_memcpy(v, sptr, pixel_bytes);
+                        for (j = 0; j < png_pass_inc[pass]; j++)
+                        {
+                           dp -= pixel_bytes;  // step onto the next output slot
+                           png_memcpy(dp, v, pixel_bytes);
+                        }
+                     }
+                  }
+
+               } /* end of pixel_bytes == 4 */
+
+               else if (pixel_bytes == 6)
+               {
+                  for (i = row_info->width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr -= pixel_bytes;
+                  }
+               } /* end of pixel_bytes == 6 */
+
+               else
+               {
+                  for (i = row_info->width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr-= pixel_bytes;
+                  }
+               }
+            } /* end of mmx_supported */
+
+            else /* MMX not supported:  use modified C code - takes advantage
+                  * of inlining of memcpy for a constant */
+            {
+               if (pixel_bytes == 1)
+               {
+                  for (i = row_info->width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr -= pixel_bytes;
+                  }
+               }
+               else if (pixel_bytes == 3)
+               {
+                  for (i = row_info->width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr -= pixel_bytes;
+                  }
+               }
+               else if (pixel_bytes == 2)
+               {
+                  for (i = row_info->width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr -= pixel_bytes;
+                  }
+               }
+               else if (pixel_bytes == 4)
+               {
+                  for (i = row_info->width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr -= pixel_bytes;
+                  }
+               }
+               else if (pixel_bytes == 6)
+               {
+                  for (i = row_info->width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr -= pixel_bytes;
+                  }
+               }
+               else
+               {
+                  for (i = row_info->width; i; i--)
+                  {
+                     png_byte v[8];
+                     int j;
+                     png_memcpy(v, sptr, pixel_bytes);
+                     for (j = 0; j < png_pass_inc[pass]; j++)
+                     {
+                        png_memcpy(dp, v, pixel_bytes);
+                        dp -= pixel_bytes;
+                     }
+                     sptr -= pixel_bytes;
+                  }
+               }
+
+            } /* end of MMX not supported */
+            break;
+         }
+      } /* end switch (row_info->pixel_depth) */
+
+      row_info->width = final_width;
       row_info->rowbytes = ((final_width *
-   (png_uint_32)row_info->pixel_depth + 7) >> 3);
+         (png_uint_32)row_info->pixel_depth + 7) >> 3);
    }
+   mmx_supported = save_mmx_supported;
 }
 
-#endif
-
+#endif /* PNG_READ_INTERLACING_SUPPORTED */
 
 
 // These variables are utilized in the functions below.  They are declared
 // globally here to ensure alignment on 8-byte boundaries.
+
 union uAll {
    __int64 use;
    double  align;
-}  LBCarryMask = {0x0101010101010101}, HBClearMask = {0x7f7f7f7f7f7f7f7f},
-   ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
+} LBCarryMask = {0x0101010101010101},
+  HBClearMask = {0x7f7f7f7f7f7f7f7f},
+  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
+
 
 // Optimized code for PNG Average filter decoder
 void
 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
                             , png_bytep prev_row)
 {
-      int bpp;
-      png_uint_32 FullLength;
-      png_uint_32 MMXLength;
-      //png_uint_32 len;
-      int diff;
-      bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-    FullLength  = row_info->rowbytes; // # of bytes to filter
-      _asm {
+   int bpp;
+   png_uint_32 FullLength;
+   png_uint_32 MMXLength;
+   //png_uint_32 len;
+   int diff;
+
+   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
+   FullLength  = row_info->rowbytes; // # of bytes to filter
+   _asm {
          // Init address pointers and offset
          mov edi, row          // edi ==> Avg(x)
          xor ebx, ebx          // ebx ==> x
          mov edx, edi
-   mov esi, prev_row           // esi ==> Prior(x)
+         mov esi, prev_row           // esi ==> Prior(x)
          sub edx, bpp          // edx ==> Raw(x-bpp)
 
          xor eax, eax
@@ -2027,12 +1891,12 @@
          //    Raw(x) = Avg(x) + (Prior(x)/2)
 davgrlp:
          mov al, [esi + ebx]   // Load al with Prior(x)
-   inc ebx
+         inc ebx
          shr al, 1             // divide by 2
          add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
          cmp ebx, bpp
-   mov [edi+ebx-1], al    // Write back Raw(x);
-                          // mov does not affect flags; -1 to offset inc ebx
+         mov [edi+ebx-1], al    // Write back Raw(x);
+                            // mov does not affect flags; -1 to offset inc ebx
          jb davgrlp
          // get # of bytes to alignment
          mov diff, edi         // take start of row
@@ -2047,27 +1911,27 @@
          xor ecx, ecx
 davglp1:
          xor eax, eax
-   mov cl, [esi + ebx]        // load cl with Prior(x)
+         mov cl, [esi + ebx]        // load cl with Prior(x)
          mov al, [edx + ebx]  // load al with Raw(x-bpp)
          add ax, cx
-   inc ebx
+         inc ebx
          shr ax, 1            // divide by 2
          add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
-   cmp ebx, diff              // Check if at alignment boundary
-   mov [edi+ebx-1], al        // Write back Raw(x);
+         cmp ebx, diff              // Check if at alignment boundary
+         mov [edi+ebx-1], al        // Write back Raw(x);
                             // mov does not affect flags; -1 to offset inc ebx
-   jb davglp1               // Repeat until at alignment boundary
+         jb davglp1               // Repeat until at alignment boundary
 davggo:
-   mov eax, FullLength
+         mov eax, FullLength
          mov ecx, eax
          sub eax, ebx          // subtract alignment fix
          and eax, 0x00000007   // calc bytes over mult of 8
          sub ecx, eax          // drop over bytes from original length
          mov MMXLength, ecx
-      } // end _asm block
-      // Now do the math for the rest of the row
-      switch ( bpp )
-      {
+   } // end _asm block
+   // Now do the math for the rest of the row
+   switch ( bpp )
+   {
       case 3:
       {
          ActiveMask.use  = 0x0000000000ffffff;
@@ -2080,21 +1944,21 @@
             movq mm5, LBCarryMask
             mov edi, row       // edi ==> Avg(x)
             movq mm4, HBClearMask
-      mov esi, prev_row        // esi ==> Prior(x)
+            mov esi, prev_row        // esi ==> Prior(x)
             // PRIME the pump (load the first Raw(x-bpp) data set
-      movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
+            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                // (we correct position in loop below)
 davg3lp:
-      movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
+            movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
             // Add (Prev_row/2) to Average
             movq mm3, mm5
             psrlq mm2, ShiftRem      // Correct position Raw(x-bpp) data
-      movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
+            movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
             movq mm6, mm7
             pand mm3, mm1      // get lsb for each prev_row byte
             psrlq mm1, 1       // divide prev_row bytes by 2
             pand  mm1, mm4     // clear invalid bit 7 of each byte
-      paddb mm0, mm1           // add (Prev_row/2) to Avg for each byte
+            paddb mm0, mm1     // add (Prev_row/2) to Avg for each byte
             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
             movq mm1, mm3      // now use mm1 for getting LBCarrys
             pand mm1, mm2      // get LBCarrys for each byte where both
@@ -2103,173 +1967,180 @@
             pand  mm2, mm4     // clear invalid bit 7 of each byte
             paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
             pand mm2, mm6      // Leave only Active Group 1 bytes to add to Avg
-      paddb mm0, mm2      // add (Raw/2) + LBCarrys to Avg for each Active byte
+            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
+                               //  byte
             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
             movq mm2, mm0        // mov updated Raws to mm2
             psllq mm2, ShiftBpp  // shift data to position correctly
             movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2        // get LBCarrys for each byte where both
-                                 // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1         // divide raw bytes by 2
-            pand  mm2, mm4       // clear invalid bit 7 of each byte
-            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
-      paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active byte
+            pand mm1, mm2      // get LBCarrys for each byte where both
+                               // lsb's were == 1 (Only valid for active group)
+            psrlq mm2, 1       // divide raw bytes by 2
+            pand  mm2, mm4     // clear invalid bit 7 of each byte
+            paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            pand mm2, mm6      // Leave only Active Group 2 bytes to add to Avg
+            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
+                               //  byte
 
             // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two bytes
+            psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
+                                 // bytes
             movq mm2, mm0        // mov updated Raws to mm2
             psllq mm2, ShiftBpp  // shift data to position correctly
-                                 // Data only needs to be shifted once here to
-                                 // get the correct x-bpp offset.
-            movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2        // get LBCarrys for each byte where both
-                                 // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1         // divide raw bytes by 2
-            pand  mm2, mm4       // clear invalid bit 7 of each byte
-            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
-      add ebx, 8
-      paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active byte
+                              // Data only needs to be shifted once here to
+                              // get the correct x-bpp offset.
+            movq mm1, mm3     // now use mm1 for getting LBCarrys
+            pand mm1, mm2     // get LBCarrys for each byte where both
+                              // lsb's were == 1 (Only valid for active group)
+            psrlq mm2, 1      // divide raw bytes by 2
+            pand  mm2, mm4    // clear invalid bit 7 of each byte
+            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            pand mm2, mm6     // Leave only Active Group 3 bytes to add to Avg
+            add ebx, 8
+            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
+                              // byte
 
             // Now ready to write back to memory
-      movq [edi + ebx - 8], mm0
+            movq [edi + ebx - 8], mm0
             // Move updated Raw(x) to use as Raw(x-bpp) for next loop
-      cmp ebx, MMXLength
+            cmp ebx, MMXLength
             movq mm2, mm0     // mov updated Raw(x) to mm2
-      jb davg3lp
-   } // end _asm block
+            jb davg3lp
+         } // end _asm block
       }
       break;
+
       case 6:
       case 4:
       case 7:
       case 5:
       {
          ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
-                                               // appropriate inactive bytes
+                                                // appropriate inactive bytes
          ShiftBpp.use = bpp << 3;
          ShiftRem.use = 64 - ShiftBpp.use;
-   _asm {
+         _asm {
             movq mm4, HBClearMask
             // Re-init address pointers and offset
             mov ebx, diff       // ebx ==> x = offset to alignment boundary
             // Load ActiveMask and clear all bytes except for 1st active group
             movq mm7, ActiveMask
-            mov edi, row                  // edi ==> Avg(x)
+            mov edi, row         // edi ==> Avg(x)
             psrlq mm7, ShiftRem
-      mov esi, prev_row             // esi ==> Prior(x)
+            mov esi, prev_row    // esi ==> Prior(x)
             movq mm6, mm7
             movq mm5, LBCarryMask
-            psllq mm6, ShiftBpp    // Create mask for 2nd active group
+            psllq mm6, ShiftBpp  // Create mask for 2nd active group
             // PRIME the pump (load the first Raw(x-bpp) data set
-      movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
+            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                  // (we correct position in loop below)
 davg4lp:
-      movq mm0, [edi + ebx]
+            movq mm0, [edi + ebx]
             psrlq mm2, ShiftRem  // shift data to position correctly
-      movq mm1, [esi + ebx]
+            movq mm1, [esi + ebx]
             // Add (Prev_row/2) to Average
             movq mm3, mm5
-            pand mm3, mm1        // get lsb for each prev_row byte
-            psrlq mm1, 1         // divide prev_row bytes by 2
-            pand  mm1, mm4       // clear invalid bit 7 of each byte
-      paddb mm0, mm1             // add (Prev_row/2) to Avg for each byte
+            pand mm3, mm1     // get lsb for each prev_row byte
+            psrlq mm1, 1      // divide prev_row bytes by 2
+            pand  mm1, mm4    // clear invalid bit 7 of each byte
+            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2        // get LBCarrys for each byte where both
-                                 // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1         // divide raw bytes by 2
-            pand  mm2, mm4       // clear invalid bit 7 of each byte
-            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm7        // Leave only Active Group 1 bytes to add to Avg
-      paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active byte
+            movq mm1, mm3     // now use mm1 for getting LBCarrys
+            pand mm1, mm2     // get LBCarrys for each byte where both
+                              // lsb's were == 1 (Only valid for active group)
+            psrlq mm2, 1      // divide raw bytes by 2
+            pand  mm2, mm4    // clear invalid bit 7 of each byte
+            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            pand mm2, mm7     // Leave only Active Group 1 bytes to add to Avg
+            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
+                              // byte
             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm2, mm0        // mov updated Raws to mm2
-            psllq mm2, ShiftBpp  // shift data to position correctly
-      add ebx, 8
-            movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2        // get LBCarrys for each byte where both
-                                 // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1         // divide raw bytes by 2
-            pand  mm2, mm4       // clear invalid bit 7 of each byte
-            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
-      paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active byte
-      cmp ebx, MMXLength
+            movq mm2, mm0     // mov updated Raws to mm2
+            psllq mm2, ShiftBpp // shift data to position correctly
+            add ebx, 8
+            movq mm1, mm3     // now use mm1 for getting LBCarrys
+            pand mm1, mm2     // get LBCarrys for each byte where both
+                              // lsb's were == 1 (Only valid for active group)
+            psrlq mm2, 1      // divide raw bytes by 2
+            pand  mm2, mm4    // clear invalid bit 7 of each byte
+            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
+            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
+                              // byte
+            cmp ebx, MMXLength
             // Now ready to write back to memory
-      movq [edi + ebx - 8], mm0
+            movq [edi + ebx - 8], mm0
             // Prep Raw(x-bpp) for next loop
-            movq mm2, mm0        // mov updated Raws to mm2
-      jb davg4lp
-   } // end _asm block
+            movq mm2, mm0     // mov updated Raws to mm2
+            jb davg4lp
+         } // end _asm block
       }
       break;
       case 2:
       {
          ActiveMask.use  = 0x000000000000ffff;
-         ShiftBpp.use = 24;      // == 3 * 8
-         ShiftRem.use = 40;      // == 64 - 24
-   _asm {
+         ShiftBpp.use = 16;   // == bpp(2) * 8; steps mask/data one 2-byte pixel
+         ShiftRem.use = 48;   // == 64 - 16; brings bytes 6 & 7 down to 0 & 1
+         _asm {
             // Load ActiveMask
             movq mm7, ActiveMask
             // Re-init address pointers and offset
-            mov ebx, diff        // ebx ==> x = offset to alignment boundary
+            mov ebx, diff     // ebx ==> x = offset to alignment boundary
             movq mm5, LBCarryMask
-            mov edi, row         // edi ==> Avg(x)
+            mov edi, row      // edi ==> Avg(x)
             movq mm4, HBClearMask
-      mov esi, prev_row          // esi ==> Prior(x)
+            mov esi, prev_row  // esi ==> Prior(x)
             // PRIME the pump (load the first Raw(x-bpp) data set
-      movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
-                                 // (we correct position in loop below)
+            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
+                              // (we correct position in loop below)
 davg2lp:
-      movq mm0, [edi + ebx]
+            movq mm0, [edi + ebx]
-            psllq mm2, ShiftRem  // shift data to position correctly
+            psrlq mm2, ShiftRem  // shift right: last pixel of previous 8 bytes
-      movq mm1, [esi + ebx]
+            movq mm1, [esi + ebx]
             // Add (Prev_row/2) to Average
             movq mm3, mm5
-            pand mm3, mm1        // get lsb for each prev_row byte
-            psrlq mm1, 1         // divide prev_row bytes by 2
-            pand  mm1, mm4       // clear invalid bit 7 of each byte
+            pand mm3, mm1     // get lsb for each prev_row byte
+            psrlq mm1, 1      // divide prev_row bytes by 2
+            pand  mm1, mm4    // clear invalid bit 7 of each byte
             movq mm6, mm7
-      paddb mm0, mm1             // add (Prev_row/2) to Avg for each byte
+            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2        // get LBCarrys for each byte where both
-                                 // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1         // divide raw bytes by 2
-            pand  mm2, mm4       // clear invalid bit 7 of each byte
-            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6        // Leave only Active Group 1 bytes to add to Avg
-      paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active byte
+            movq mm1, mm3     // now use mm1 for getting LBCarrys
+            pand mm1, mm2     // get LBCarrys for each byte where both
+                              // lsb's were == 1 (Only valid for active group)
+            psrlq mm2, 1      // divide raw bytes by 2
+            pand  mm2, mm4    // clear invalid bit 7 of each byte
+            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            pand mm2, mm6     // Leave only Active Group 1 bytes to add to Avg
+            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 2 & 3
-            movq mm2, mm0        // mov updated Raws to mm2
-            psllq mm2, ShiftBpp  // shift data to position correctly
-            movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2        // get LBCarrys for each byte where both
-                                 // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1         // divide raw bytes by 2
-            pand  mm2, mm4       // clear invalid bit 7 of each byte
-            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
-      paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active byte
+            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
+            movq mm2, mm0       // mov updated Raws to mm2
+            psllq mm2, ShiftBpp // shift data to position correctly
+            movq mm1, mm3       // now use mm1 for getting LBCarrys
+            pand mm1, mm2       // get LBCarrys for each byte where both
+                                // lsb's were == 1 (Only valid for active group)
+            psrlq mm2, 1        // divide raw bytes by 2
+            pand  mm2, mm4      // clear invalid bit 7 of each byte
+            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
+            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
 
-            // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry
+            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 4 & 5
-            movq mm2, mm0        // mov updated Raws to mm2
-            psllq mm2, ShiftBpp  // shift data to position correctly
-                                 // Data only needs to be shifted once here to
-                                 // get the correct x-bpp offset.
-            movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2        // get LBCarrys for each byte where both
-                                 // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1         // divide raw bytes by 2
-            pand  mm2, mm4       // clear invalid bit 7 of each byte
-            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
-      paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active byte
+            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
+            movq mm2, mm0       // mov updated Raws to mm2
+            psllq mm2, ShiftBpp // shift data to position correctly
+                                // Data only needs to be shifted once here to
+                                // get the correct x-bpp offset.
+            movq mm1, mm3       // now use mm1 for getting LBCarrys
+            pand mm1, mm2       // get LBCarrys for each byte where both
+                                // lsb's were == 1 (Only valid for active group)
+            psrlq mm2, 1        // divide raw bytes by 2
+            pand  mm2, mm4      // clear invalid bit 7 of each byte
+            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            pand mm2, mm6       // Leave only Active Group 3 bytes to add to Avg
+            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
 
             // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
@@ -2278,72 +2149,73 @@
                                  // Data only needs to be shifted once here to
                                  // get the correct x-bpp offset.
             add ebx, 8
-            movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2        // get LBCarrys for each byte where both
-                                 // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1         // divide raw bytes by 2
-            pand  mm2, mm4       // clear invalid bit 7 of each byte
-            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
-      paddb mm0, mm2      // add (Raw/2) + LBCarrys to Avg for each Active byte
+            movq mm1, mm3    // now use mm1 for getting LBCarrys
+            pand mm1, mm2    // get LBCarrys for each byte where both
+                             // lsb's were == 1 (Only valid for active group)
+            psrlq mm2, 1     // divide raw bytes by 2
+            pand  mm2, mm4   // clear invalid bit 7 of each byte
+            paddb mm2, mm1   // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            pand mm2, mm6    // Leave only Active Group 4 bytes to add to Avg
+            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
 
-      cmp ebx, MMXLength
+            cmp ebx, MMXLength
             // Now ready to write back to memory
-      movq [edi + ebx - 8], mm0
+            movq [edi + ebx - 8], mm0
             // Prep Raw(x-bpp) for next loop
-            movq mm2, mm0        // mov updated Raws to mm2
-      jb davg2lp
-  } // end _asm block
+            movq mm2, mm0    // mov updated Raws to mm2
+            jb davg2lp
+        } // end _asm block
       }
       break;
-      case 1:                    // bpp == 1
+
+      case 1:                 // bpp == 1
       {
          _asm {
             // Re-init address pointers and offset
-            mov ebx, diff        // ebx ==> x = offset to alignment boundary
-            mov edi, row         // edi ==> Avg(x)
+            mov ebx, diff     // ebx ==> x = offset to alignment boundary
+            mov edi, row      // edi ==> Avg(x)
             cmp ebx, FullLength  // Test if offset at end of array
-      jnb davg1end
+            jnb davg1end
-            // Do Paeth decode for remaining bytes
+            // Do Avg decode for remaining bytes
-        mov esi, prev_row        // esi ==> Prior(x)
+            mov esi, prev_row    // esi ==> Prior(x)
             mov edx, edi
             xor ecx, ecx         // zero ecx before using cl & cx in loop below
             sub edx, bpp         // edx ==> Raw(x-bpp)
 davg1lp:
             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
             xor eax, eax
-      mov cl, [esi + ebx]        // load cl with Prior(x)
+            mov cl, [esi + ebx]  // load cl with Prior(x)
             mov al, [edx + ebx]  // load al with Raw(x-bpp)
             add ax, cx
-      inc ebx
+            inc ebx
             shr ax, 1            // divide by 2
             add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
-      cmp ebx, FullLength        // Check if at end of array
-      mov [edi+ebx-1], al        // Write back Raw(x);
+            cmp ebx, FullLength  // Check if at end of array
+            mov [edi+ebx-1], al  // Write back Raw(x);
                          // mov does not affect flags; -1 to offset inc ebx
-      jb davg1lp
+            jb davg1lp
 davg1end:
-   } // end _asm block
+         } // end _asm block
       }
       return;
 
       case 8:             // bpp == 8
       {
-   _asm {
+         _asm {
             // Re-init address pointers and offset
             mov ebx, diff           // ebx ==> x = offset to alignment boundary
             movq mm5, LBCarryMask
             mov edi, row            // edi ==> Avg(x)
             movq mm4, HBClearMask
-      mov esi, prev_row             // esi ==> Prior(x)
+            mov esi, prev_row       // esi ==> Prior(x)
             // PRIME the pump (load the first Raw(x-bpp) data set
-      movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
+            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                 // (NO NEED to correct position in loop below)
 davg8lp:
-      movq mm0, [edi + ebx]
+            movq mm0, [edi + ebx]
             movq mm3, mm5
-      movq mm1, [esi + ebx]
-      add ebx, 8
+            movq mm1, [esi + ebx]
+            add ebx, 8
             pand mm3, mm1       // get lsb for each prev_row byte
             psrlq mm1, 1        // divide prev_row bytes by 2
             pand mm3, mm2       // get LBCarrys for each byte where both
@@ -2353,31 +2225,31 @@
             paddb mm0, mm3      // add LBCarrys to Avg for each byte
             pand  mm2, mm4      // clear invalid bit 7 of each byte
             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
-      paddb mm0, mm2            // add (Raw/2) to Avg for each byte
-      cmp ebx, MMXLength
-      movq [edi + ebx - 8], mm0
+            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
+            cmp ebx, MMXLength
+            movq [edi + ebx - 8], mm0
             movq mm2, mm0       // reuse as Raw(x-bpp)
-      jb davg8lp
-  } // end _asm block
+            jb davg8lp
+        } // end _asm block
       }
       break;
       default:                  // bpp greater than 8
       {
-  _asm {
+        _asm {
             movq mm5, LBCarryMask
             // Re-init address pointers and offset
             mov ebx, diff       // ebx ==> x = offset to alignment boundary
             mov edi, row        // edi ==> Avg(x)
             movq mm4, HBClearMask
             mov edx, edi
-      mov esi, prev_row         // esi ==> Prior(x)
+            mov esi, prev_row   // esi ==> Prior(x)
             sub edx, bpp        // edx ==> Raw(x-bpp)
 davgAlp:
-      movq mm0, [edi + ebx]
+            movq mm0, [edi + ebx]
             movq mm3, mm5
-      movq mm1, [esi + ebx]
+            movq mm1, [esi + ebx]
             pand mm3, mm1       // get lsb for each prev_row byte
-      movq mm2, [edx + ebx]
+            movq mm2, [edx + ebx]
             psrlq mm1, 1        // divide prev_row bytes by 2
             pand mm3, mm2       // get LBCarrys for each byte where both
                                 // lsb's were == 1
@@ -2386,70 +2258,72 @@
             paddb mm0, mm3      // add LBCarrys to Avg for each byte
             pand  mm2, mm4      // clear invalid bit 7 of each byte
             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
-      add ebx, 8
-      paddb mm0, mm2             // add (Raw/2) to Avg for each byte
-      cmp ebx, MMXLength
-      movq [edi + ebx - 8], mm0
-      jb davgAlp
-  } // end _asm block
+            add ebx, 8
+            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
+            cmp ebx, MMXLength
+            movq [edi + ebx - 8], mm0
+            jb davgAlp
+        } // end _asm block
       }
       break;
-      }                         // end switch ( bpp )
+   }                         // end switch ( bpp )
 
-      _asm {
+   _asm {
          // MMX acceleration complete now do clean-up
          // Check if any remaining bytes left to decode
-   mov ebx, MMXLength           // ebx ==> x = offset bytes remaining after MMX
-     mov edi, row               // edi ==> Avg(x)
-   cmp ebx, FullLength          // Test if offset at end of array
-   jnb davgend
+         mov ebx, MMXLength    // ebx ==> x = offset bytes remaining after MMX
+         mov edi, row          // edi ==> Avg(x)
+         cmp ebx, FullLength   // Test if offset at end of array
+         jnb davgend
-         // Do Paeth decode for remaining bytes
+         // Do Avg decode for remaining bytes
-     mov esi, prev_row          // esi ==> Prior(x)
+         mov esi, prev_row     // esi ==> Prior(x)
          mov edx, edi
-         xor ecx, ecx           // zero ecx before using cl & cx in loop below
-         sub edx, bpp           // edx ==> Raw(x-bpp)
+         xor ecx, ecx          // zero ecx before using cl & cx in loop below
+         sub edx, bpp          // edx ==> Raw(x-bpp)
 davglp2:
          // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
          xor eax, eax
-   mov cl, [esi + ebx]        // load cl with Prior(x)
-         mov al, [edx + ebx]    // load al with Raw(x-bpp)
+         mov cl, [esi + ebx]   // load cl with Prior(x)
+         mov al, [edx + ebx]   // load al with Raw(x-bpp)
          add ax, cx
-   inc ebx
+         inc ebx
          shr ax, 1              // divide by 2
          add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
-   cmp ebx, FullLength        // Check if at end of array
-   mov [edi+ebx-1], al        // Write back Raw(x);
+         cmp ebx, FullLength    // Check if at end of array
+         mov [edi+ebx-1], al    // Write back Raw(x);
                           // mov does not affect flags; -1 to offset inc ebx
-   jb davglp2
+         jb davglp2
 davgend:
-   emms                   // End MMX instructions; prep for possible FP instrs.
+         emms             // End MMX instructions; prep for possible FP instrs.
    } // end _asm block
 }
 
 // Optimized code for PNG Paeth filter decoder
 void
-png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row
-                            , png_bytep prev_row)
+png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
+                              png_bytep prev_row)
 {
-      png_uint_32 FullLength;
-      png_uint_32 MMXLength;
-      //png_uint_32 len;
-      int bpp;
-      int diff;
-      //int ptemp;
-      int patemp, pbtemp, pctemp;
-      bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-      FullLength  = row_info->rowbytes; // # of bytes to filter
-      _asm {
-         xor ebx, ebx                  // ebx ==> x offset
-   mov edi, row
-         xor edx, edx                  // edx ==> x-bpp offset
-   mov esi, prev_row
+   png_uint_32 FullLength;
+   png_uint_32 MMXLength;
+   //png_uint_32 len;
+   int bpp;
+   int diff;
+   //int ptemp;
+   int patemp, pbtemp, pctemp;
+
+   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
+   FullLength  = row_info->rowbytes; // # of bytes to filter
+   _asm
+   {
+         xor ebx, ebx        // ebx ==> x offset
+         mov edi, row
+         xor edx, edx        // edx ==> x-bpp offset
+         mov esi, prev_row
          xor eax, eax
 
-     // Compute the Raw value for the first bpp bytes
-     // Note: the formula works out to always be Paeth(x) = Raw(x) + Prior(x)
-     //        where x < bpp
+         // Compute the Raw value for the first bpp bytes
+         // Note: the formula works out to be always
+         //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
 dpthrlp:
          mov al, [edi + ebx]
          add al, [esi + ebx]
@@ -2460,7 +2334,7 @@
          // get # of bytes to alignment
          mov diff, edi         // take start of row
          add diff, ebx         // add bpp
-   xor ecx, ecx
+         xor ecx, ecx
          add diff, 0xf         // add 7 + 8 to incr past alignment boundary
          and diff, 0xfffffff8  // mask to alignment boundary
          sub diff, edi         // subtract from start ==> value ebx at alignment
@@ -2523,33 +2397,34 @@
          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
 dpthpaeth:
-   inc ebx
-   inc edx
+         inc ebx
+         inc edx
          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
          add [edi + ebx - 1], cl
-   cmp ebx, diff
-   jb dpthlp1
+         cmp ebx, diff
+         jb dpthlp1
 dpthgo:
-    mov ecx, FullLength
+         mov ecx, FullLength
          mov eax, ecx
          sub eax, ebx          // subtract alignment fix
          and eax, 0x00000007   // calc bytes over mult of 8
          sub ecx, eax          // drop over bytes from original length
          mov MMXLength, ecx
-     } // end _asm block
-      // Now do the math for the rest of the row
-      switch ( bpp )
-      {
+   } // end _asm block
+   // Now do the math for the rest of the row
+   switch ( bpp )
+   {
       case 3:
       {
          ActiveMask.use = 0x0000000000ffffff;
          ActiveMaskEnd.use = 0xffff000000000000;
          ShiftBpp.use = 24;    // == bpp(3) * 8
          ShiftRem.use = 40;    // == 64 - 24
-      _asm {
+         _asm
+         {
             mov ebx, diff
-         mov edi, row
-         mov esi, prev_row
+            mov edi, row
+            mov esi, prev_row
             pxor mm0, mm0
             // PRIME the pump (load the first Raw(x-bpp) data set
             movq mm1, [edi+ebx-8]
@@ -2574,23 +2449,23 @@
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4        // Create mask pav bytes < 0
+            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
             paddw mm6, mm5
-            pand mm0, mm4           // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5        // Create mask pbv bytes < 0
+            pand mm0, mm4       // Only pav bytes < 0 in mm0
+            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
             psubw mm4, mm0
-            pand mm7, mm5           // Only pbv bytes < 0 in mm0
+            pand mm7, mm5       // Only pbv bytes < 0 in mm7
             psubw mm4, mm0
             psubw mm5, mm7
             pxor mm0, mm0
-            pcmpgtw mm0, mm6        // Create mask pcv bytes < 0
-            pand mm0, mm6           // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
+            pand mm0, mm6       // Only pcv bytes < 0 in mm0
             psubw mm5, mm7
             psubw mm6, mm0
             //  test pa <= pb
             movq mm7, mm4
             psubw mm6, mm0
-            pcmpgtw mm7, mm5        // pa > pb?
+            pcmpgtw mm7, mm5    // pa > pb?
             movq mm0, mm7
             // use mm7 mask to merge pa & pb
             pand mm5, mm7
@@ -2601,7 +2476,7 @@
             paddw mm7, mm5
             paddw mm0, mm2
             //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6           // pab > pc?
+            pcmpgtw mm7, mm6       // pab > pc?
             pxor mm1, mm1
             pand mm3, mm7
             pandn mm7, mm0
@@ -2634,22 +2509,22 @@
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm5           // Create mask pbv bytes < 0
-            pcmpgtw mm7, mm4           // Create mask pav bytes < 0
-            pand mm0, mm5              // Only pbv bytes < 0 in mm0
-            pand mm7, mm4              // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
+            pcmpgtw mm7, mm4       // Create mask pav bytes < 0
+            pand mm0, mm5          // Only pbv bytes < 0 in mm0
+            pand mm7, mm4          // Only pav bytes < 0 in mm7
             psubw mm5, mm0
             psubw mm4, mm7
             psubw mm5, mm0
             psubw mm4, mm7
             pxor mm0, mm0
-            pcmpgtw mm0, mm6           // Create mask pcv bytes < 0
-            pand mm0, mm6              // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
+            pand mm0, mm6          // Only pcv bytes < 0 in mm0
             psubw mm6, mm0
             //  test pa <= pb
             movq mm7, mm4
             psubw mm6, mm0
-            pcmpgtw mm7, mm5           // pa > pb?
+            pcmpgtw mm7, mm5       // pa > pb?
             movq mm0, mm7
             // use mm7 mask to merge pa & pb
             pand mm5, mm7
@@ -2660,8 +2535,8 @@
             paddw mm7, mm5
             paddw mm0, mm2
             //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6           // pab > pc?
-            movq mm2, [esi + ebx]      // load b=Prior(x)
+            pcmpgtw mm7, mm6       // pab > pc?
+            movq mm2, [esi + ebx]  // load b=Prior(x)
             pand mm3, mm7
             pandn mm7, mm0
             pxor mm1, mm1
@@ -2696,22 +2571,22 @@
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4        // Create mask pav bytes < 0
-            pcmpgtw mm7, mm5        // Create mask pbv bytes < 0
-            pand mm0, mm4           // Only pav bytes < 0 in mm7
-            pand mm7, mm5           // Only pbv bytes < 0 in mm0
+            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
+            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
+            pand mm0, mm4       // Only pav bytes < 0 in mm0
+            pand mm7, mm5       // Only pbv bytes < 0 in mm7
             psubw mm4, mm0
             psubw mm5, mm7
             psubw mm4, mm0
             psubw mm5, mm7
             pxor mm0, mm0
-            pcmpgtw mm0, mm6        // Create mask pcv bytes < 0
-            pand mm0, mm6           // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
+            pand mm0, mm6       // Only pcv bytes < 0 in mm0
             psubw mm6, mm0
             //  test pa <= pb
             movq mm7, mm4
             psubw mm6, mm0
-            pcmpgtw mm7, mm5        // pa > pb?
+            pcmpgtw mm7, mm5    // pa > pb?
             movq mm0, mm7
             // use mm0 mask copy to merge a & b
             pand mm2, mm0
@@ -2722,26 +2597,27 @@
             paddw mm0, mm2
             paddw mm7, mm5
             //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6        // pab > pc?
+            pcmpgtw mm7, mm6    // pab > pc?
             pand mm3, mm7
             pandn mm7, mm0
             paddw mm7, mm3
             pxor mm1, mm1
             packuswb mm1, mm7
             // Step ebx to next set of 8 bytes and repeat loop til done
-      add ebx, 8
+            add ebx, 8
             pand mm1, ActiveMaskEnd
             paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
 
-      cmp ebx, MMXLength
+            cmp ebx, MMXLength
             pxor mm0, mm0              // pxor does not affect flags
             movq [edi + ebx - 8], mm1  // write back updated value
                                  // mm1 will be used as Raw(x-bpp) next loop
                            // mm3 ready to be used as Prior(x-bpp) next loop
-      jb dpth3lp
-  } // end _asm block
+            jb dpth3lp
+         } // end _asm block
       }
       break;
+
       case 6:
       case 7:
       case 5:
@@ -2750,18 +2626,19 @@
          ActiveMask2.use = 0xffffffff00000000;
          ShiftBpp.use = bpp << 3;    // == bpp * 8
          ShiftRem.use = 64 - ShiftBpp.use;
-    _asm {
+         _asm
+         {
             mov ebx, diff
-         mov edi, row               //
-         mov esi, prev_row
+            mov edi, row
+            mov esi, prev_row
             // PRIME the pump (load the first Raw(x-bpp) data set
-      movq mm1, [edi+ebx-8]
+            movq mm1, [edi+ebx-8]
             pxor mm0, mm0
 dpth6lp:
             // Must shift to position Raw(x-bpp) data
             psrlq mm1, ShiftRem
             // Do first set of 4 bytes
-      movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
+            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
             punpcklbw mm1, mm0      // Unpack Low bytes of a
             movq mm2, [esi + ebx]   // load b=Prior(x)
             punpcklbw mm2, mm0      // Unpack Low bytes of b
@@ -2780,23 +2657,23 @@
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4        // Create mask pav bytes < 0
+            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
             paddw mm6, mm5
-            pand mm0, mm4           // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5        // Create mask pbv bytes < 0
+            pand mm0, mm4       // Only pav bytes < 0 in mm0
+            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
             psubw mm4, mm0
-            pand mm7, mm5           // Only pbv bytes < 0 in mm0
+            pand mm7, mm5       // Only pbv bytes < 0 in mm7
             psubw mm4, mm0
             psubw mm5, mm7
             pxor mm0, mm0
-            pcmpgtw mm0, mm6        // Create mask pcv bytes < 0
-            pand mm0, mm6           // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
+            pand mm0, mm6       // Only pcv bytes < 0 in mm0
             psubw mm5, mm7
             psubw mm6, mm0
             //  test pa <= pb
             movq mm7, mm4
             psubw mm6, mm0
-            pcmpgtw mm7, mm5        // pa > pb?
+            pcmpgtw mm7, mm5    // pa > pb?
             movq mm0, mm7
             // use mm7 mask to merge pa & pb
             pand mm5, mm7
@@ -2807,7 +2684,7 @@
             paddw mm7, mm5
             paddw mm0, mm2
             //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6        // pab > pc?
+            pcmpgtw mm7, mm6    // pab > pc?
             pxor mm1, mm1
             pand mm3, mm7
             pandn mm7, mm0
@@ -2821,7 +2698,7 @@
             paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
             movq mm6, mm2
             movq [edi + ebx], mm7      // write back updated value
-        movq mm1, [edi+ebx-8]
+            movq mm1, [edi+ebx-8]
             psllq mm6, ShiftBpp
             movq mm5, mm7
             psrlq mm1, ShiftRem
@@ -2844,23 +2721,23 @@
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4           // Create mask pav bytes < 0
+            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
             paddw mm6, mm5
-            pand mm0, mm4              // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5           // Create mask pbv bytes < 0
+            pand mm0, mm4          // Only pav bytes < 0 in mm0
+            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
             psubw mm4, mm0
-            pand mm7, mm5              // Only pbv bytes < 0 in mm0
+            pand mm7, mm5          // Only pbv bytes < 0 in mm7
             psubw mm4, mm0
             psubw mm5, mm7
             pxor mm0, mm0
-            pcmpgtw mm0, mm6           // Create mask pcv bytes < 0
-            pand mm0, mm6              // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
+            pand mm0, mm6          // Only pcv bytes < 0 in mm0
             psubw mm5, mm7
             psubw mm6, mm0
             //  test pa <= pb
             movq mm7, mm4
             psubw mm6, mm0
-            pcmpgtw mm7, mm5           // pa > pb?
+            pcmpgtw mm7, mm5       // pa > pb?
             movq mm0, mm7
             // use mm7 mask to merge pa & pb
             pand mm5, mm7
@@ -2879,29 +2756,31 @@
             paddw mm7, mm3
             pxor mm0, mm0
             // Step ex to next set of 8 bytes and repeat loop til done
-      add ebx, 8
+            add ebx, 8
             packuswb mm1, mm7
             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
-      cmp ebx, MMXLength
+            cmp ebx, MMXLength
             movq [edi + ebx - 8], mm1      // write back updated value
                                 // mm1 will be used as Raw(x-bpp) next loop
-      jb dpth6lp
-  } // end _asm block
+            jb dpth6lp
+         } // end _asm block
       }
       break;
+
       case 4:
       {
          ActiveMask.use  = 0x00000000ffffffff;
-  _asm {
+         _asm {
             mov ebx, diff
-         mov edi, row               //
-         mov esi, prev_row
+            mov edi, row
+            mov esi, prev_row
             pxor mm0, mm0
             // PRIME the pump (load the first Raw(x-bpp) data set
-      movq mm1, [edi+ebx-8] // Only time should need to read a=Raw(x-bpp) bytes
+            movq mm1, [edi+ebx-8]    // Only time should need to read
+                                     //  a=Raw(x-bpp) bytes
 dpth4lp:
             // Do first set of 4 bytes
-          movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
+            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
             punpckhbw mm1, mm0       // Unpack Low bytes of a
             movq mm2, [esi + ebx]    // load b=Prior(x)
             punpcklbw mm2, mm0       // Unpack High bytes of b
@@ -2918,23 +2797,23 @@
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4           // Create mask pav bytes < 0
+            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
             paddw mm6, mm5
-            pand mm0, mm4              // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5           // Create mask pbv bytes < 0
+            pand mm0, mm4          // Only pav bytes < 0 in mm0
+            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
             psubw mm4, mm0
-            pand mm7, mm5              // Only pbv bytes < 0 in mm0
+            pand mm7, mm5          // Only pbv bytes < 0 in mm7
             psubw mm4, mm0
             psubw mm5, mm7
             pxor mm0, mm0
-            pcmpgtw mm0, mm6           // Create mask pcv bytes < 0
-            pand mm0, mm6              // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
+            pand mm0, mm6          // Only pcv bytes < 0 in mm0
             psubw mm5, mm7
             psubw mm6, mm0
             //  test pa <= pb
             movq mm7, mm4
             psubw mm6, mm0
-            pcmpgtw mm7, mm5           // pa > pb?
+            pcmpgtw mm7, mm5       // pa > pb?
             movq mm0, mm7
             // use mm7 mask to merge pa & pb
             pand mm5, mm7
@@ -2945,7 +2824,7 @@
             paddw mm7, mm5
             paddw mm0, mm2
             //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6           // pab > pc?
+            pcmpgtw mm7, mm6       // pab > pc?
             pxor mm1, mm1
             pand mm3, mm7
             pandn mm7, mm0
@@ -2974,23 +2853,23 @@
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4           // Create mask pav bytes < 0
+            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
             paddw mm6, mm5
-            pand mm0, mm4              // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5           // Create mask pbv bytes < 0
+            pand mm0, mm4          // Only pav bytes < 0 in mm0
+            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
             psubw mm4, mm0
-            pand mm7, mm5              // Only pbv bytes < 0 in mm0
+            pand mm7, mm5          // Only pbv bytes < 0 in mm7
             psubw mm4, mm0
             psubw mm5, mm7
             pxor mm0, mm0
-            pcmpgtw mm0, mm6           // Create mask pcv bytes < 0
-            pand mm0, mm6              // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
+            pand mm0, mm6          // Only pcv bytes < 0 in mm0
             psubw mm5, mm7
             psubw mm6, mm0
             //  test pa <= pb
             movq mm7, mm4
             psubw mm6, mm0
-            pcmpgtw mm7, mm5           // pa > pb?
+            pcmpgtw mm7, mm5       // pa > pb?
             movq mm0, mm7
             // use mm7 mask to merge pa & pb
             pand mm5, mm7
@@ -3001,7 +2880,7 @@
             paddw mm7, mm5
             paddw mm0, mm2
             //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6           // pab > pc?
+            pcmpgtw mm7, mm6       // pab > pc?
             pxor mm1, mm1
             pand mm3, mm7
             pandn mm7, mm0
@@ -3009,29 +2888,30 @@
             paddw mm7, mm3
             pxor mm0, mm0
             // Step ex to next set of 8 bytes and repeat loop til done
-        add ebx, 8
+            add ebx, 8
             packuswb mm1, mm7
             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
-      cmp ebx, MMXLength
+            cmp ebx, MMXLength
             movq [edi + ebx - 8], mm1      // write back updated value
                                 // mm1 will be used as Raw(x-bpp) next loop
-      jb dpth4lp
-  } // end _asm block
+            jb dpth4lp
+         } // end _asm block
       }
       break;
       case 8:                          // bpp == 8
       {
          ActiveMask.use  = 0x00000000ffffffff;
-   _asm {
+         _asm {
             mov ebx, diff
-         mov edi, row               //
-         mov esi, prev_row
+            mov edi, row
+            mov esi, prev_row
             pxor mm0, mm0
             // PRIME the pump (load the first Raw(x-bpp) data set
-      movq mm1, [edi+ebx-8] // Only time should need to read a=Raw(x-bpp) bytes
+            movq mm1, [edi+ebx-8]      // Only time should need to read
+                                       //  a=Raw(x-bpp) bytes
 dpth8lp:
             // Do first set of 4 bytes
-      movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
+            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
             punpcklbw mm1, mm0         // Unpack Low bytes of a
             movq mm2, [esi + ebx]      // load b=Prior(x)
             punpcklbw mm2, mm0         // Unpack Low bytes of b
@@ -3048,23 +2928,23 @@
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4           // Create mask pav bytes < 0
+            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
             paddw mm6, mm5
-            pand mm0, mm4              // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5           // Create mask pbv bytes < 0
+            pand mm0, mm4          // Only pav bytes < 0 in mm0
+            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
             psubw mm4, mm0
-            pand mm7, mm5              // Only pbv bytes < 0 in mm0
+            pand mm7, mm5          // Only pbv bytes < 0 in mm7
             psubw mm4, mm0
             psubw mm5, mm7
             pxor mm0, mm0
-            pcmpgtw mm0, mm6           // Create mask pcv bytes < 0
-            pand mm0, mm6              // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
+            pand mm0, mm6          // Only pcv bytes < 0 in mm0
             psubw mm5, mm7
             psubw mm6, mm0
             //  test pa <= pb
             movq mm7, mm4
             psubw mm6, mm0
-            pcmpgtw mm7, mm5           // pa > pb?
+            pcmpgtw mm7, mm5       // pa > pb?
             movq mm0, mm7
             // use mm7 mask to merge pa & pb
             pand mm5, mm7
@@ -3075,24 +2955,24 @@
             paddw mm7, mm5
             paddw mm0, mm2
             //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6           // pab > pc?
+            pcmpgtw mm7, mm6       // pab > pc?
             pxor mm1, mm1
             pand mm3, mm7
             pandn mm7, mm0
             paddw mm7, mm3
             pxor mm0, mm0
             packuswb mm7, mm1
-        movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
+            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
             pand mm7, ActiveMask
-            movq mm2, [esi + ebx]      // load b=Prior(x)
-            paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
-            punpckhbw mm3, mm0         // Unpack High bytes of c
-            movq [edi + ebx], mm7      // write back updated value
-        movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
+            movq mm2, [esi + ebx]    // load b=Prior(x)
+            paddb mm7, [edi + ebx]   // add Paeth predictor with Raw(x)
+            punpckhbw mm3, mm0       // Unpack High bytes of c
+            movq [edi + ebx], mm7    // write back updated value
+            movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
 
             // Do second set of 4 bytes
-            punpckhbw mm2, mm0         // Unpack High bytes of b
-            punpckhbw mm1, mm0         // Unpack High bytes of a
+            punpckhbw mm2, mm0       // Unpack High bytes of b
+            punpckhbw mm1, mm0       // Unpack High bytes of a
             // pav = p - a = (a + b - c) - a = b - c
             movq mm4, mm2
             // pbv = p - b = (a + b - c) - b = a - c
@@ -3105,23 +2985,23 @@
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4           // Create mask pav bytes < 0
+            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
             paddw mm6, mm5
-            pand mm0, mm4              // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5           // Create mask pbv bytes < 0
+            pand mm0, mm4          // Only pav bytes < 0 in mm0
+            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
             psubw mm4, mm0
-            pand mm7, mm5              // Only pbv bytes < 0 in mm0
+            pand mm7, mm5          // Only pbv bytes < 0 in mm7
             psubw mm4, mm0
             psubw mm5, mm7
             pxor mm0, mm0
-            pcmpgtw mm0, mm6           // Create mask pcv bytes < 0
-            pand mm0, mm6              // Only pav bytes < 0 in mm7
+            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
+            pand mm0, mm6          // Only pcv bytes < 0 in mm0
             psubw mm5, mm7
             psubw mm6, mm0
             //  test pa <= pb
             movq mm7, mm4
             psubw mm6, mm0
-            pcmpgtw mm7, mm5           // pa > pb?
+            pcmpgtw mm7, mm5       // pa > pb?
             movq mm0, mm7
             // use mm7 mask to merge pa & pb
             pand mm5, mm7
@@ -3132,7 +3012,7 @@
             paddw mm7, mm5
             paddw mm0, mm2
             //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6           // pab > pc?
+            pcmpgtw mm7, mm6       // pab > pc?
             pxor mm1, mm1
             pand mm3, mm7
             pandn mm7, mm0
@@ -3140,26 +3020,27 @@
             paddw mm7, mm3
             pxor mm0, mm0
             // Step ex to next set of 8 bytes and repeat loop til done
-        add ebx, 8
+            add ebx, 8
             packuswb mm1, mm7
             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
-        cmp ebx, MMXLength
+            cmp ebx, MMXLength
             movq [edi + ebx - 8], mm1      // write back updated value
                             // mm1 will be used as Raw(x-bpp) next loop
-        jb dpth8lp
-      } // end _asm block
+            jb dpth8lp
+         } // end _asm block
       }
       break;
-      case 1:                          // bpp = 1
-      case 2:                          // bpp = 2
-      default:                         // bpp > 8
+
+      case 1:                // bpp = 1
+      case 2:                // bpp = 2
+      default:               // bpp > 8
       {
-   _asm {
-      mov ebx, diff
-      cmp ebx, FullLength
-      jnb dpthdend
-        mov edi, row               //
-        mov esi, prev_row
+         _asm {
+            mov ebx, diff
+            cmp ebx, FullLength
+            jnb dpthdend
+            mov edi, row
+            mov esi, prev_row
             // Do Paeth decode for remaining bytes
             mov edx, ebx
             xor ecx, ecx        // zero ecx before using cl & cx in loop below
@@ -3221,25 +3102,26 @@
             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
             mov cl, [edi + edx]  // load Raw(x-bpp) into cl
 dpthdpaeth:
-      inc ebx
-      inc edx
+            inc ebx
+            inc edx
             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
             add [edi + ebx - 1], cl
-      cmp ebx, FullLength
-      jb dpthdlp
+            cmp ebx, FullLength
+            jb dpthdlp
 dpthdend:
-        } // end _asm block
+         } // end _asm block
       }
       return;                   // No need to go further with this one
-      }                         // end switch ( bpp )
-      _asm {
+   }                         // end switch ( bpp )
+   _asm
+   {
          // MMX acceleration complete now do clean-up
          // Check if any remaining bytes left to decode
-   mov ebx, MMXLength
-   cmp ebx, FullLength
-   jnb dpthend
-     mov edi, row
-     mov esi, prev_row
+         mov ebx, MMXLength
+         cmp ebx, FullLength
+         jnb dpthend
+         mov edi, row
+         mov esi, prev_row
          // Do Paeth decode for remaining bytes
          mov edx, ebx
          xor ecx, ecx         // zero ecx before using cl & cx in loop below
@@ -3301,69 +3183,71 @@
          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
 dpthpaeth2:
-      inc ebx
-      inc edx
+         inc ebx
+         inc edx
          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
          add [edi + ebx - 1], cl
-      cmp ebx, FullLength
-      jb dpthlp2
+         cmp ebx, FullLength
+         jb dpthlp2
 dpthend:
-      emms             // End MMX instructions; prep for possible FP instrs.
-     } // end _asm block
+         emms             // End MMX instructions; prep for possible FP instrs.
+   } // end _asm block
 }
 
 // Optimized code for PNG Sub filter decoder
 void
 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
 {
-    //int test;
-      int bpp;
-    png_uint_32 FullLength;
-    png_uint_32 MMXLength;
-    int diff;
-      bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-    FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
-    _asm {
+   //int test;
+   int bpp;
+   png_uint_32 FullLength;
+   png_uint_32 MMXLength;
+   int diff;
+
+   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
+   FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
+   _asm {
         mov edi, row
         mov esi, edi               // lp = row
-            add edi, bpp               // rp = row + bpp
-            xor eax, eax
-            // get # of bytes to alignment
-            mov diff, edi               // take start of row
-            add diff, 0xf               // add 7 + 8 to incr past
+        add edi, bpp               // rp = row + bpp
+        xor eax, eax
+        // get # of bytes to alignment
+        mov diff, edi               // take start of row
+        add diff, 0xf               // add 7 + 8 to incr past
                                         // alignment boundary
-            xor ebx, ebx
-            and diff, 0xfffffff8        // mask to alignment boundary
-            sub diff, edi               // subtract from start ==> value
+        xor ebx, ebx
+        and diff, 0xfffffff8        // mask to alignment boundary
+        sub diff, edi               // subtract from start ==> value
                                         //  ebx at alignment
-            jz dsubgo
-            // fix alignment
+        jz dsubgo
+        // fix alignment
 dsublp1:
-         mov al, [esi+ebx]
-         add [edi+ebx], al
-          inc ebx
-          cmp ebx, diff
-         jb dsublp1
+        mov al, [esi+ebx]
+        add [edi+ebx], al
+        inc ebx
+        cmp ebx, diff
+        jb dsublp1
 dsubgo:
-         mov ecx, FullLength
-            mov edx, ecx
-            sub edx, ebx                  // subtract alignment fix
-            and edx, 0x00000007           // calc bytes over mult of 8
-            sub ecx, edx                  // drop over bytes from length
-            mov MMXLength, ecx
-     } // end _asm block
-      // Now do the math for the rest of the row
-      switch ( bpp )
-      {
-      case 3:
-    {
+        mov ecx, FullLength
+        mov edx, ecx
+        sub edx, ebx                  // subtract alignment fix
+        and edx, 0x00000007           // calc bytes over mult of 8
+        sub ecx, edx                  // drop over bytes from length
+        mov MMXLength, ecx
+   } // end _asm block
+
+   // Now do the math for the rest of the row
+   switch ( bpp )
+   {
+        case 3:
+        {
          ActiveMask.use  = 0x0000ffffff000000;
          ShiftBpp.use = 24;       // == 3 * 8
          ShiftRem.use  = 40;      // == 64 - 24
-      _asm {
+         _asm {
             mov edi, row
             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
-        mov esi, edi              // lp = row
+            mov esi, edi              // lp = row
             add edi, bpp          // rp = row + bpp
             movq mm6, mm7
             mov ebx, diff
@@ -3376,234 +3260,242 @@
                           // no need for mask; shift clears inactive bytes
             // Add 1st active group
             movq mm0, [edi+ebx]
-        paddb mm0, mm1
+            paddb mm0, mm1
             // Add 2nd active group
             movq mm1, mm0         // mov updated Raws to mm1
             psllq mm1, ShiftBpp   // shift data to position correctly
             pand mm1, mm7         // mask to use only 2nd active group
-        paddb mm0, mm1
+            paddb mm0, mm1
             // Add 3rd active group
             movq mm1, mm0         // mov updated Raws to mm1
             psllq mm1, ShiftBpp   // shift data to position correctly
             pand mm1, mm6         // mask to use only 3rd active group
-        add ebx, 8
-        paddb mm0, mm1
-        cmp ebx, MMXLength
-        movq [edi+ebx-8], mm0     // Write updated Raws back to array
+            add ebx, 8
+            paddb mm0, mm1
+            cmp ebx, MMXLength
+            movq [edi+ebx-8], mm0     // Write updated Raws back to array
             // Prep for doing 1st add at top of loop
             movq mm1, mm0
-        jb dsub3lp
-      } // end _asm block
+            jb dsub3lp
+         } // end _asm block
       }
       break;
+
       case 1:
-    {
-      /* Placed here just in case this is a duplicate of the
-      non-MMX code for the SUB filter in png_read_filter_row
-                        above
-      */
-//         png_bytep rp;
-//         png_bytep lp;
-//         png_uint_32 i;
-//         bpp = (row_info->pixel_depth + 7) >> 3;
-//         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
-//            i < row_info->rowbytes; i++, rp++, lp++)
-//      {
-//            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
-//      }
-      _asm {
+      {
+         // Placed here just in case this is a duplicate of the
+         // non-MMX code for the SUB filter in png_read_filter_row above
+         //
+         //         png_bytep rp;
+         //         png_bytep lp;
+         //         png_uint_32 i;
+         //         bpp = (row_info->pixel_depth + 7) >> 3;
+         //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
+         //            i < row_info->rowbytes; i++, rp++, lp++)
+         //      {
+         //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
+         //      }
+         _asm {
             mov ebx, diff
             mov edi, row
-        cmp ebx, FullLength
-        jnb dsub1end
-        mov esi, edi          // lp = row
-        xor eax, eax
+            cmp ebx, FullLength
+            jnb dsub1end
+            mov esi, edi          // lp = row
+            xor eax, eax
             add edi, bpp      // rp = row + bpp
 dsub1lp:
-        mov al, [esi+ebx]
-        add [edi+ebx], al
-          inc ebx
-          cmp ebx, FullLength
-        jb dsub1lp
+            mov al, [esi+ebx]
+            add [edi+ebx], al
+            inc ebx
+            cmp ebx, FullLength
+            jb dsub1lp
 dsub1end:
-      } // end _asm block
-    }
+         } // end _asm block
+      }
       return;
+
       case 6:
       case 7:
       case 4:
       case 5:
-    {
+      {
          ShiftBpp.use = bpp << 3;
          ShiftRem.use = 64 - ShiftBpp.use;
-      _asm {
+         _asm {
             mov edi, row
             mov ebx, diff
-        mov esi, edi               // lp = row
+            mov esi, edi               // lp = row
             add edi, bpp           // rp = row + bpp
             // PRIME the pump (load the first Raw(x-bpp) data set
             movq mm1, [edi+ebx-8]
 dsub4lp:
             psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
                           // no need for mask; shift clears inactive bytes
-        movq mm0, [edi+ebx]
-        paddb mm0, mm1
+            movq mm0, [edi+ebx]
+            paddb mm0, mm1
             // Add 2nd active group
             movq mm1, mm0          // mov updated Raws to mm1
             psllq mm1, ShiftBpp    // shift data to position correctly
                                    // there is no need for any mask
                                    // since shift clears inactive bits/bytes
-        add ebx, 8
-        paddb mm0, mm1
-        cmp ebx, MMXLength
-        movq [edi+ebx-8], mm0
+            add ebx, 8
+            paddb mm0, mm1
+            cmp ebx, MMXLength
+            movq [edi+ebx-8], mm0
             movq mm1, mm0          // Prep for doing 1st add at top of loop
-        jb dsub4lp
-      } // end _asm block
+            jb dsub4lp
+         } // end _asm block
       }
       break;
+
       case 2:
-    {
+      {
          ActiveMask.use  = 0x00000000ffff0000;
          ShiftBpp.use = 16;       // == 2 * 8
          ShiftRem.use = 48;       // == 64 - 16
-      _asm {
+         _asm {
             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
             mov ebx, diff
             movq mm6, mm7
-        mov edi, row
-            psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active byte group
-        mov esi, edi            // lp = row
+            mov edi, row
+            psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active
+                                    //  byte group
+            mov esi, edi            // lp = row
             movq mm5, mm6
-            add edi, bpp        // rp = row + bpp
-            psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active byte group
+            add edi, bpp            // rp = row + bpp
+            psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active
+                                    //  byte group
             // PRIME the pump (load the first Raw(x-bpp) data set
             movq mm1, [edi+ebx-8]
 dsub2lp:
             // Add 1st active group
-            psrlq mm1, ShiftRem    // Shift data for adding 1st bpp bytes
-                                 // no need for mask; shift clears inactive bytes
+            psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
+                                    // no need for mask; shift clears inactive
+                                    //  bytes
             movq mm0, [edi+ebx]
-        paddb mm0, mm1
+            paddb mm0, mm1
             // Add 2nd active group
-            movq mm1, mm0              // mov updated Raws to mm1
-            psllq mm1, ShiftBpp      // shift data to position correctly
-            pand mm1, mm7              // mask to use only 2nd active group
-        paddb mm0, mm1
+            movq mm1, mm0           // mov updated Raws to mm1
+            psllq mm1, ShiftBpp     // shift data to position correctly
+            pand mm1, mm7           // mask to use only 2nd active group
+            paddb mm0, mm1
             // Add 3rd active group
-            movq mm1, mm0              // mov updated Raws to mm1
-            psllq mm1, ShiftBpp      // shift data to position correctly
-            pand mm1, mm6              // mask to use only 3rd active group
-        paddb mm0, mm1
+            movq mm1, mm0           // mov updated Raws to mm1
+            psllq mm1, ShiftBpp     // shift data to position correctly
+            pand mm1, mm6           // mask to use only 3rd active group
+            paddb mm0, mm1
             // Add 4th active group
-            movq mm1, mm0              // mov updated Raws to mm1
-            psllq mm1, ShiftBpp      // shift data to position correctly
-            pand mm1, mm5              // mask to use only 4th active group
-        add ebx, 8
-        paddb mm0, mm1
-        cmp ebx, MMXLength
-        movq [edi+ebx-8], mm0        // Write updated Raws back to array
-            movq mm1, mm0            // Prep for doing 1st add at top of loop
-        jb dsub2lp
-      } // end _asm block
+            movq mm1, mm0           // mov updated Raws to mm1
+            psllq mm1, ShiftBpp     // shift data to position correctly
+            pand mm1, mm5           // mask to use only 4th active group
+            add ebx, 8
+            paddb mm0, mm1
+            cmp ebx, MMXLength
+            movq [edi+ebx-8], mm0   // Write updated Raws back to array
+            movq mm1, mm0           // Prep for doing 1st add at top of loop
+            jb dsub2lp
+         } // end _asm block
       }
       break;
       case 8:
-    {
-      _asm {
-        mov edi, row
+      {
+         _asm {
+            mov edi, row
             mov ebx, diff
-        mov esi, edi               // lp = row
-            add edi, bpp             // rp = row + bpp
-         mov ecx, MMXLength
+            mov esi, edi            // lp = row
+            add edi, bpp            // rp = row + bpp
+            mov ecx, MMXLength
             movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
                                     // Raw(x-bpp) data set
             and ecx, 0x0000003f     // calc bytes over mult of 64
 dsub8lp:
-        movq mm0, [edi+ebx]         // Load Sub(x) for 1st 8 bytes
-        paddb mm0, mm7
-               movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
-        movq [edi+ebx], mm0        // Write Raw(x) for 1st 8 bytes
+            movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
+            paddb mm0, mm7
+            movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
+            movq [edi+ebx], mm0    // Write Raw(x) for 1st 8 bytes
                                    // Now mm0 will be used as Raw(x-bpp) for
                                    // the 2nd group of 8 bytes.  This will be
                                    // repeated for each group of 8 bytes with
                                    // the 8th group being used as the Raw(x-bpp)
                                    // for the 1st group of the next loop.
-        paddb mm1, mm0
-        movq mm2, [edi+ebx+16]      // Load Sub(x) for 3rd 8 bytes
-        movq [edi+ebx+8], mm1      // Write Raw(x) for 2nd 8 bytes
-        paddb mm2, mm1
-        movq mm3, [edi+ebx+24]      // Load Sub(x) for 4th 8 bytes
-        movq [edi+ebx+16], mm2      // Write Raw(x) for 3rd 8 bytes
-        paddb mm3, mm2
-        movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
-        movq [edi+ebx+24], mm3      // Write Raw(x) for 4th 8 bytes
-        paddb mm4, mm3
-        movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
-        movq [edi+ebx+32], mm4      // Write Raw(x) for 5th 8 bytes
-        paddb mm5, mm4
-        movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
-        movq [edi+ebx+40], mm5      // Write Raw(x) for 6th 8 bytes
-        paddb mm6, mm5
-        movq mm7, [edi+ebx+56]      // Load Sub(x) for 8th 8 bytes
-        movq [edi+ebx+48], mm6      // Write Raw(x) for 7th 8 bytes
-        add ebx, 64
-        paddb mm7, mm6
-        cmp ebx, ecx
-        movq [edi+ebx-8], mm7      // Write Raw(x) for 8th 8 bytes
-        jb dsub8lp
-        cmp ebx, MMXLength
-        jnb dsub8lt8
+            paddb mm1, mm0
+            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
+            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
+            paddb mm2, mm1
+            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
+            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
+            paddb mm3, mm2
+            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
+            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
+            paddb mm4, mm3
+            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
+            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
+            paddb mm5, mm4
+            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
+            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
+            paddb mm6, mm5
+            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
+            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
+            add ebx, 64
+            paddb mm7, mm6
+            cmp ebx, ecx
+            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
+            jb dsub8lp
+            cmp ebx, MMXLength
+            jnb dsub8lt8
 dsub8lpA:
             movq mm0, [edi+ebx]
-        add ebx, 8
-        paddb mm0, mm7
-        cmp ebx, MMXLength
-        movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
-            movq mm7, mm0       // Move calculated Raw(x) data to mm1 to
-                                // be the new Raw(x-bpp) for the next loop
-        jb dsub8lpA
+            add ebx, 8
+            paddb mm0, mm7
+            cmp ebx, MMXLength
+            movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
+            movq mm7, mm0           // Move calculated Raw(x) data to mm7 to
+                                    // be the new Raw(x-bpp) for the next loop
+            jb dsub8lpA
 dsub8lt8:
-      } // end _asm block
+         } // end _asm block
       }
       break;
+
       default:                // bpp greater than 8 bytes
-    {
-      _asm {
+      {
+         _asm {
             mov ebx, diff
-        mov edi, row
-        mov esi, edi               // lp = row
+            mov edi, row
+            mov esi, edi           // lp = row
             add edi, bpp           // rp = row + bpp
 dsubAlp:
-        movq mm0, [edi+ebx]
-        movq mm1, [esi+ebx]
-        add ebx, 8
-        paddb mm0, mm1
-        cmp ebx, MMXLength
-        movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset add ebx
-        jb dsubAlp
-      } // end _asm block
+            movq mm0, [edi+ebx]
+            movq mm1, [esi+ebx]
+            add ebx, 8
+            paddb mm0, mm1
+            cmp ebx, MMXLength
+            movq [edi+ebx-8], mm0  // mov does not affect flags; -8 to offset
+                                   //  add ebx
+            jb dsubAlp
+         } // end _asm block
       }
       break;
-      }                                // end switch ( bpp )
 
-      _asm {
-            mov ebx, MMXLength
-            mov edi, row
+   } // end switch ( bpp )
+
+   _asm {
+        mov ebx, MMXLength
+        mov edi, row
         cmp ebx, FullLength
         jnb dsubend
         mov esi, edi               // lp = row
         xor eax, eax
-            add edi, bpp           // rp = row + bpp
+        add edi, bpp               // rp = row + bpp
 dsublp2:
         mov al, [esi+ebx]
         add [edi+ebx], al
-          inc ebx
-          cmp ebx, FullLength
+        inc ebx
+        cmp ebx, FullLength
         jb dsublp2
 dsubend:
-         emms             // End MMX instructions; prep for possible FP instrs.
-    } // end _asm block
+        emms             // End MMX instructions; prep for possible FP instrs.
+   } // end _asm block
 }
 
 // Optimized code for PNG Up filter decoder
@@ -3611,20 +3503,20 @@
 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
    png_bytep prev_row)
 {
-      png_uint_32 len;
-    len  = row_info->rowbytes;       // # of bytes to filter
-    _asm {
+   png_uint_32 len;
+   len  = row_info->rowbytes;       // # of bytes to filter
+   _asm {
       mov edi, row
-         // get # of bytes to alignment
-         mov ecx, edi
-       xor ebx, ebx
-         add ecx, 0x7
-         xor eax, eax
-         and ecx, 0xfffffff8
+      // get # of bytes to alignment
+      mov ecx, edi
+      xor ebx, ebx
+      add ecx, 0x7
+      xor eax, eax
+      and ecx, 0xfffffff8
       mov esi, prev_row
-         sub ecx, edi
-         jz dupgo
-         // fix alignment
+      sub ecx, edi
+      jz dupgo
+      // fix alignment
 duplp1:
       mov al, [edi+ebx]
       add al, [esi+ebx]
@@ -3634,47 +3526,47 @@
       jb duplp1
 dupgo:
       mov ecx, len
-         mov edx, ecx
-         sub edx, ebx                  // subtract alignment fix
-         and edx, 0x0000003f           // calc bytes over mult of 64
-         sub ecx, edx                  // drop over bytes from length
-         // Unrolled loop - use all MMX registers and interleave to reduce
-         // number of branch instructions (loops) and reduce partial stalls
+      mov edx, ecx
+      sub edx, ebx                  // subtract alignment fix
+      and edx, 0x0000003f           // calc bytes over mult of 64
+      sub ecx, edx                  // drop over bytes from length
+      // Unrolled loop - use all MMX registers and interleave to reduce
+      // number of branch instructions (loops) and reduce partial stalls
 duploop:
       movq mm1, [esi+ebx]
       movq mm0, [edi+ebx]
-          movq mm3, [esi+ebx+8]
+      movq mm3, [esi+ebx+8]
       paddb mm0, mm1
-          movq mm2, [edi+ebx+8]
+      movq mm2, [edi+ebx+8]
       movq [edi+ebx], mm0
-          paddb mm2, mm3
-            movq mm5, [esi+ebx+16]
-          movq [edi+ebx+8], mm2
-            movq mm4, [edi+ebx+16]
-               movq mm7, [esi+ebx+24]
-            paddb mm4, mm5
-               movq mm6, [edi+ebx+24]
-            movq [edi+ebx+16], mm4
-               paddb mm6, mm7
+      paddb mm2, mm3
+      movq mm5, [esi+ebx+16]
+      movq [edi+ebx+8], mm2
+      movq mm4, [edi+ebx+16]
+      movq mm7, [esi+ebx+24]
+      paddb mm4, mm5
+      movq mm6, [edi+ebx+24]
+      movq [edi+ebx+16], mm4
+      paddb mm6, mm7
       movq mm1, [esi+ebx+32]
-               movq [edi+ebx+24], mm6
+      movq [edi+ebx+24], mm6
       movq mm0, [edi+ebx+32]
-         movq mm3, [esi+ebx+40]
+      movq mm3, [esi+ebx+40]
       paddb mm0, mm1
-         movq mm2, [edi+ebx+40]
+      movq mm2, [edi+ebx+40]
       movq [edi+ebx+32], mm0
-         paddb mm2, mm3
-            movq mm5, [esi+ebx+48]
-         movq [edi+ebx+40], mm2
-            movq mm4, [edi+ebx+48]
-               movq mm7, [esi+ebx+56]
-            paddb mm4, mm5
-               movq mm6, [edi+ebx+56]
-            movq [edi+ebx+48], mm4
-         add ebx, 64
-               paddb mm6, mm7
+      paddb mm2, mm3
+      movq mm5, [esi+ebx+48]
+      movq [edi+ebx+40], mm2
+      movq mm4, [edi+ebx+48]
+      movq mm7, [esi+ebx+56]
+      paddb mm4, mm5
+      movq mm6, [edi+ebx+56]
+      movq [edi+ebx+48], mm4
+      add ebx, 64
+      paddb mm6, mm7
       cmp ebx, ecx
-               movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
+      movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
                                      // -8 to offset add ebx
       jb duploop
 
@@ -3682,17 +3574,17 @@
       jz dupend
 
 
-         // 2 lines added by lcreeve@netins.net
-         // (mail 11 Jul 98 in png-implement list)
-         cmp edx, 8 //test for less than 8 bytes
-         jb duplt8
+      // 2 lines added by lcreeve@netins.net
+      // (mail 11 Jul 98 in png-implement list)
+      cmp edx, 8 //test for less than 8 bytes
+      jb duplt8
 
 
-         add ecx, edx
-         and edx, 0x00000007           // calc bytes over mult of 8
-         sub ecx, edx                  // drop over bytes from length
+      add ecx, edx
+      and edx, 0x00000007           // calc bytes over mult of 8
+      sub ecx, edx                  // drop over bytes from length
       jz duplt8
-         // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
+      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
 duplpA:
       movq mm1, [esi+ebx]
       movq mm0, [edi+ebx]
@@ -3704,9 +3596,9 @@
       cmp edx, 0            // Test for bytes over mult of 8
       jz dupend
 duplt8:
-         xor eax, eax
+      xor eax, eax
       add ecx, edx          // move over byte count into counter
-         // Loop using x86 registers to update remaining bytes
+      // Loop using x86 registers to update remaining bytes
 duplp2:
       mov al, [edi + ebx]
       add al, [esi + ebx]
@@ -3715,52 +3607,54 @@
       mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
       jb duplp2
 dupend:
-         // Conversion of filtered row completed
+      // Conversion of filtered row completed
       emms          // End MMX instructions; prep for possible FP instrs.
-    } // end _asm block
+   } // end _asm block
 }
 
 
-
 // Optimized png_read_filter_row routines
 void
 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
    row, png_bytep prev_row, int filter)
 {
+#ifdef PNG_DEBUG
    char filnm[6];
+#endif
    #define UseMMX (1)
 
+   if (mmx_supported == 2)
+       mmx_supported = mmxsupport();
 
-   if (mmx_supported==2)
-       mmx_supported=mmxsupport();
-   //if (!mmx_supported)
+   if (!mmx_supported)
    {
        png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
        return ;
    }
 
-
+#ifdef PNG_DEBUG
    png_debug(1, "in png_read_filter_row\n");
    png_debug1(0,"%s, ", (UseMMX?"MMX":"x86"));
    switch (filter)
    {
-   case 0: sprintf(filnm, "None ");
-      break;
-   case 1: sprintf(filnm, "Sub  ");
-      break;
-   case 2: sprintf(filnm, "Up   ");
-      break;
-   case 3: sprintf(filnm, "Avg  ");
-      break;
-   case 4: sprintf(filnm, "Paeth");
-      break;
-   default: sprintf(filnm, "Unknw");
-      break;
+      case 0: sprintf(filnm, "None ");
+         break;
+      case 1: sprintf(filnm, "Sub  ");
+         break;
+      case 2: sprintf(filnm, "Up   ");
+         break;
+      case 3: sprintf(filnm, "Avg  ");
+         break;
+      case 4: sprintf(filnm, "Paeth");
+         break;
+      default: sprintf(filnm, "Unknw");
+         break;
    }
    png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
    png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
       (int)((row_info->pixel_depth + 7) >> 3));
    png_debug1(0,"len=%8d, ", row_info->rowbytes);
+#endif
 
    switch (filter)
    {
@@ -3775,16 +3669,17 @@
          }  //end if UseMMX
          else
          {
-            int bpp;
-            png_bytep rp;
-            png_bytep lp;
             png_uint_32 i;
-            bpp = (row_info->pixel_depth + 7) >> 3;
-            for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
-               i < row_info->rowbytes; i++, rp++, lp++)
+            png_uint_32 istop = row_info->rowbytes;
+            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
+            png_bytep rp = row + bpp;
+            png_bytep lp = row;
+
+            for (i = bpp; i < istop; i++)
             {
-               *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
-         }
+               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
+               rp++;
+            }
          }  //end !UseMMX
          break;
       }
@@ -3817,23 +3712,26 @@
          }  //end if UseMMX
          else
          {
-      png_uint_32 i;
-      int bpp;
-      png_bytep rp;
-      png_bytep pp;
-      png_bytep lp;
-            bpp = (row_info->pixel_depth + 7) >> 3;
-            for (i = 0, rp = row, pp = prev_row;
-               i < (png_uint_32)bpp; i++, rp++, pp++)
-      {
+            png_uint_32 i;
+            png_bytep rp = row;
+            png_bytep pp = prev_row;
+            png_bytep lp = row;
+            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
+            png_uint_32 istop = row_info->rowbytes - bpp;
+
+            for (i = 0; i < bpp; i++)
+            {
                *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp) / 2)) & 0xff);
-      }
-            for (lp = row; i < row_info->rowbytes; i++, rp++, lp++, pp++)
-      {
+                  ((int)(*pp++) >> 1)) & 0xff);
+               rp++;
+            }
+
+            for (i = 0; i < istop; i++)
+            {
                *rp = (png_byte)(((int)(*rp) +
-                  (int)(*pp + *lp) / 2) & 0xff);
-      }
+                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
+               rp++;
+            }
          }  //end !UseMMX
          break;
       }
@@ -3846,36 +3744,54 @@
          }  //end if UseMMX
          else
          {
-            int bpp;
             png_uint_32 i;
-            png_bytep rp;
-            png_bytep pp;
-            png_bytep lp;
-            png_bytep cp;
-            bpp = (row_info->pixel_depth + 7) >> 3;
-            for (i = 0, rp = row, pp = prev_row;
-               i < (png_uint_32)bpp; i++, rp++, pp++)
+            png_bytep rp = row;
+            png_bytep pp = prev_row;
+            png_bytep lp = row;
+            png_bytep cp = prev_row;
+            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
+            png_uint_32 istop=row_info->rowbytes - bpp;
+
+            for (i = 0; i < bpp; i++)
             {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
+               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+               rp++;
             }
-            for (lp = rp - bpp, cp = pp - bpp;
-               i < row_info->rowbytes; i++, rp++, pp++, lp++, cp++)
+
+            for (i = 0; i < istop; i++)   // use leftover rp,pp
             {
                int a, b, c, pa, pb, pc, p;
-               b = *pp;
-               c = *cp;
-               a = *lp;
-               p = a + b - c;
-               pa = abs(p - a);
-               pb = abs(p - b);
-               pc = abs(p - c);
-               if (pa <= pb && pa <= pc)
-                  p = a;
-               else if (pb <= pc)
-                  p = b;
-               else
-                  p = c;
+
+               a = *lp++;
+               b = *pp++;
+               c = *cp++;
+
+               p = b - c;
+               pc = a - c;
+
+#ifdef    PNG_USE_ABS
+               pa = abs(p);
+               pb = abs(pc);
+               pc = abs(p + pc);
+#else
+               pa = p < 0 ? -p : p;
+               pb = pc < 0 ? -pc : pc;
+               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
+#endif
+
+               /*
+                  if (pa <= pb && pa <= pc)
+                     p = a;
+                  else if (pb <= pc)
+                     p = b;
+                  else
+                     p = c;
+                */
+
+               p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
+
                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
+               rp++;
             }
          }  //end !UseMMX
          break;
diff --git a/pngwio.c b/pngwio.c
index 3831acf..d5444a0 100644
--- a/pngwio.c
+++ b/pngwio.c
@@ -1,7 +1,7 @@
 
 /* pngwio.c - functions for data output
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngwrite.c b/pngwrite.c
index 9a3c928..9830ca1 100644
--- a/pngwrite.c
+++ b/pngwrite.c
@@ -1,7 +1,7 @@
 
 /* pngwrite.c - general routines to write a PNG file
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngwtran.c b/pngwtran.c
index 10f50c0..cd32a62 100644
--- a/pngwtran.c
+++ b/pngwtran.c
@@ -1,7 +1,7 @@
 
 /* pngwtran.c - transforms the data in a row for PNG writers
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngwutil.c b/pngwutil.c
index 446c4da..b7a104b 100644
--- a/pngwutil.c
+++ b/pngwutil.c
@@ -1,7 +1,7 @@
 
 /* pngwutil.c - utilities to write a PNG file
  *
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
  * Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/scripts/makefile.beos b/scripts/makefile.beos
index bc7be4a..0a89151 100644
--- a/scripts/makefile.beos
+++ b/scripts/makefile.beos
@@ -1,5 +1,5 @@
 # makefile for libpng on BeOS x86 ELF with gcc
-# modified from makefile.lnx by Sander Stoks
+# modified from makefile.linux by Sander Stoks
 # Copyright (C) 1996, 1997 Andreas Dilger
 # Copyright (C) 1999 Greg Roelofs
 # For conditions of distribution and use, see copyright notice in png.h
@@ -31,7 +31,7 @@
 # read libpng.txt or png.h to see why PNGMAJ is 2.  You should not
 # have to change it.
 PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # where make install puts libpng.a, libpng.so*, and png.h
diff --git a/scripts/makefile.borland b/scripts/makefile.borland
index 57f374f..2d3fe5b 100644
--- a/scripts/makefile.borland
+++ b/scripts/makefile.borland
@@ -2,8 +2,8 @@
 # Borland C++ 4.5 (Note: All modules are compiled in C mode)
 # Will work with C++ 4.02 also
 # To build the library, do: 
-#       "make -fmakefile.bor -DMODEL=m"
-# or:   "make -fmakefile.bor -DMODEL=l"
+#       "make -fmakefile.borland -DMODEL=m"
+# or:   "make -fmakefile.borland -DMODEL=l"
 #
 # ------------- Borland C++ 4.5 -------------
 
diff --git a/scripts/makefile.dec b/scripts/makefile.dec
index 6f252e4..51403ca 100644
--- a/scripts/makefile.dec
+++ b/scripts/makefile.dec
@@ -14,7 +14,7 @@
 # read libpng.txt or png.h to see why PNGMAJ is 2.  You should not
 # have to change it.
 PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 CC=cc
diff --git a/scripts/makefile.linux b/scripts/makefile.linux
index 4201201..c84b6ce 100644
--- a/scripts/makefile.linux
+++ b/scripts/makefile.linux
@@ -22,6 +22,8 @@
 	-Wmissing-declarations -Wtraditional -Wcast-align \
 	-Wstrict-prototypes -Wmissing-prototypes #-Wconversion
 
+# for pgcc version 2.95.1, -O3 is buggy; don't use it.
+
 CFLAGS=-I$(ZLIBINC) -Wall -O3 -funroll-loops \
 	$(ALIGN) # $(WARNMORE) -g -DPNG_DEBUG=5
 LDFLAGS=-L. -Wl,-rpath,. -L$(ZLIBLIB) -Wl,-rpath,$(ZLIBLIB) -lpng -lz -lm
@@ -32,7 +34,7 @@
 # read libpng.txt or png.h to see why PNGMAJ is 2.  You should not
 # have to change it.
 PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 INCPATH=$(prefix)/include
diff --git a/scripts/makefile.msc b/scripts/makefile.msc
index 6356218..96b2cfc 100644
--- a/scripts/makefile.msc
+++ b/scripts/makefile.msc
@@ -3,7 +3,7 @@
 # For conditions of distribution and use, see copyright notice in png.h
 # Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib
 
-# ------------- Microsoft C 5.1 and later -------------
+# -------- Microsoft C 5.1 and later, does not use assembler code -----
 MODEL=-AL
 CFLAGS=-Oait -Gs -nologo -W3 $(MODEL) -I..\zlib
 #-Ox generates bad code with MSC 5.1
diff --git a/scripts/makefile.sco b/scripts/makefile.sco
index 9eee4a2..1e51004 100644
--- a/scripts/makefile.sco
+++ b/scripts/makefile.sco
@@ -25,7 +25,7 @@
 # read libpng.txt or png.h to see why PNGMAJ is 2.  You should not
 # have to change it.
 PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 INCPATH=$(prefix)/include
diff --git a/scripts/makefile.solaris b/scripts/makefile.solaris
index 5f3a412..fcc3078 100644
--- a/scripts/makefile.solaris
+++ b/scripts/makefile.solaris
@@ -1,5 +1,5 @@
 # makefile for libpng on Solaris 2.x with gcc
-# Contributed by William L. Sebok, based on makefile.lnx
+# Contributed by William L. Sebok, based on makefile.linux
 # Copyright (C) 1996, 1997 Andreas Dilger
 # Copyright (C) 1998 Greg Roelofs
 # For conditions of distribution and use, see copyright notice in png.h
@@ -36,7 +36,7 @@
 # read libpng.txt or png.h to see why PNGMAJ is 2.  You should not
 # have to change it.
 PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 INCPATH=$(prefix)/include
diff --git a/scripts/makefile.turboc3 b/scripts/makefile.turboc3
index c925831..f9a2269 100644
--- a/scripts/makefile.turboc3
+++ b/scripts/makefile.turboc3
@@ -1,7 +1,7 @@
 # Makefile for libpng
 # TurboC++ 3.0 (Note: All modules are compiled in C mode)
 
-# To use, do "make -fmakefile.tc3"
+# To use, do "make -fmakefile.turboc3"
 
 # ------------- Turbo C++ 3.0 -------------
 MODEL=-ml
diff --git a/scripts/makefile.win32vc b/scripts/makefile.vcawin32
similarity index 86%
rename from scripts/makefile.win32vc
rename to scripts/makefile.vcawin32
index 52934c3..be7fcc8 100644
--- a/scripts/makefile.win32vc
+++ b/scripts/makefile.vcawin32
@@ -2,9 +2,15 @@
 # Copyright (C) 1998 Tim Wegner
 # For conditions of distribution and use, see copyright notice in png.h
 # Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib
-# To use, do "nmake /f scripts\makefile.w32"
+# To use, do "nmake /f scripts\makefile.vcawin32"
 
-# ------------- Microsoft Visual C++ 4.0 and later -------------
+# ---------- Microsoft Visual C++ 5.0 and later, uses assembler code ------
+
+# Caution: the assembler code was introduced at libpng version 1.0.4 and has
+# not yet been thoroughly tested.
+
+# If you don't want to use assembler code, use makefile.vcwin32 instead.
+
 MODEL=- 
 CFLAGS=-DPNG_USE_PNGVCRD -Ox -GA3s -nologo -W3 -I..\zlib
 
diff --git a/scripts/makefile.win32vc b/scripts/makefile.vcwin32
similarity index 85%
copy from scripts/makefile.win32vc
copy to scripts/makefile.vcwin32
index 52934c3..5b62fc3 100644
--- a/scripts/makefile.win32vc
+++ b/scripts/makefile.vcwin32
@@ -2,11 +2,13 @@
 # Copyright (C) 1998 Tim Wegner
 # For conditions of distribution and use, see copyright notice in png.h
 # Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib
-# To use, do "nmake /f scripts\makefile.w32"
+# To use, do "nmake /f scripts\makefile.vcwin32"
 
-# ------------- Microsoft Visual C++ 4.0 and later -------------
+# ---------- Microsoft Visual C++ 4.0 and later, no assembler code ------
+# If you want to use assembler code, use makefile.vcawin32 instead.
+
 MODEL=- 
-CFLAGS=-DPNG_USE_PNGVCRD -Ox -GA3s -nologo -W3 -I..\zlib
+CFLAGS= -Ox -GA3s -nologo -W3 -I..\zlib
 
 CC=cl
 LD=link
@@ -19,7 +21,7 @@
 # variables
 OBJS1 = png$(O) pngset$(O) pngget$(O) pngrutil$(O) pngtrans$(O) pngwutil$(O)
 OBJS2 = pngmem$(O) pngpread$(O) pngread$(O) pngerror$(O) pngwrite$(O)
-OBJS3 = pngrtran$(O) pngwtran$(O) pngrio$(O) pngwio$(O) pngvcrd$(O)
+OBJS3 = pngrtran$(O) pngwtran$(O) pngrio$(O) pngwio$(O)
 
 all: libpng.lib
 
@@ -44,9 +46,6 @@
 pngrutil$(O): png.h pngconf.h pngasmrd.h
 	$(CC) -c $(CFLAGS) $*.c $(ERRFILE)
 
-pngvcrd$(O): png.h pngconf.h pngasmrd.h
-	$(CC) -c $(CFLAGS) $*.c $(ERRFILE)
-
 pngerror$(O): png.h pngconf.h
 	$(CC) -c $(CFLAGS) $*.c $(ERRFILE)
 
diff --git a/scripts/makefile.watcom b/scripts/makefile.watcom
index a7d99c2..e14f162 100644
--- a/scripts/makefile.watcom
+++ b/scripts/makefile.watcom
@@ -5,7 +5,7 @@
 # For conditions of distribution and use, see copyright notice in png.h
 # Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib
 
-# To use, do "wmake /f scripts\makefile.wat"
+# To use, do "wmake /f scripts\makefile.watcom"
 
 # ------------- Watcom 10.0 and later -------------
 MODEL=-mf
diff --git a/scripts/pngdef.pas b/scripts/pngdef.pas
index 94e859a..1441808 100644
--- a/scripts/pngdef.pas
+++ b/scripts/pngdef.pas
@@ -3,8 +3,8 @@
 interface
 
 const
-  PNG_LIBPNG_VER_STRING = '1.0.4';
-  PNG_LIBPNG_VER        =  10004;
+  PNG_LIBPNG_VER_STRING = '1.0.4c';
+  PNG_LIBPNG_VER        =  10005;
 
 type
   png_uint_32 = Cardinal;