target/linux/bcm27xx/patches-5.4/950-0041-Speed-up-console-framebuffer-imageblit-function.patch - T108 - Gitiles

 From 36be92675cdb5eb76ec03997b6ee0b8a1863b08a Mon Sep 17 00:00:00 2001
 From: Harm Hanemaaijer <fgenfb@yahoo.com>
 Date: Thu, 20 Jun 2013 20:21:39 +0200
 Subject: [PATCH] Speed up console framebuffer imageblit function

 Especially on platforms with a slower CPU but a relatively high
 framebuffer fill bandwidth, like current ARM devices, the existing
 console monochrome imageblit function used to draw console text is
 suboptimal for common pixel depths such as 16bpp and 32bpp. The existing
 code is quite general and can deal with several pixel depths. By creating
 special case functions for 16bpp and 32bpp, by far the most common pixel
 formats used on modern systems, a significant speed-up is attained
 which can be readily felt on ARM-based devices like the Raspberry Pi
 and the Allwinner platform, but should help any platform using the
 fb layer.

 The special case functions allow constant folding, eliminating a number
 of instructions including divide operations, and allow the use of an
 unrolled loop, eliminating instructions with a variable shift size,
 reducing source memory access instructions, and eliminating excessive
 branching. These unrolled loops also allow much better code optimization
 by the C compiler. The code that selects which optimized variant is used
 is also simplified, eliminating integer divide instructions.

 The speed-up, measured by timing 'cat file.txt' in the console, varies
 between 40% and 70%, when testing on the Raspberry Pi and Allwinner
 ARM-based platforms, depending on font size and the pixel depth, with
 the greater benefit for 32bpp.

 Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com>
 ---
  drivers/video/fbdev/core/cfbimgblt.c | 152 ++++++++++++++++++++++++++-
  1 file changed, 147 insertions(+), 5 deletions(-)

 --- a/drivers/video/fbdev/core/cfbimgblt.c
 +++ b/drivers/video/fbdev/core/cfbimgblt.c
 @@ -28,6 +28,11 @@
   *
   *  Also need to add code to deal with cards endians that are different than
   *  the native cpu endians. I also need to deal with MSB position in the word.
 + *  Modified by Harm Hanemaaijer (fgenfb@yahoo.com) 2013:
 + *  - Provide optimized versions of fast_imageblit for 16 and 32bpp that are
 + *    significantly faster than the previous implementation.
 + *  - Simplify the fast/slow_imageblit selection code, avoiding integer
 + *    divides.
   */
  #include <linux/module.h>
  #include <linux/string.h>
 @@ -262,6 +267,133 @@ static inline void fast_imageblit(const
  	}
  }

 +/*
 + * Optimized fast_imageblit for bpp == 16 (ppw = 2, bit_mask = 3 folded
 + * into the code): two pixels per 32-bit word, main loop unrolled.
 + */
 +
 +static inline void fast_imageblit16(const struct fb_image *image,
 +				    struct fb_info *p, u8 __iomem * dst1,
 +				    u32 fgcolor, u32 bgcolor)
 +{
 +	u32 fgx = fgcolor, bgx = bgcolor;
 +	u32 spitch = (image->width + 7) / 8;	/* source bytes per row */
 +	u32 end_mask, eorx;
 +	const char *s = image->data, *src;
 +	u32 __iomem *dst;
 +	const u32 *tab = NULL;
 +	int i, j, k;
 +
 +	tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
 +
 +	fgx <<= 16;		/* shift each colour into the upper 16 bits */
 +	bgx <<= 16;
 +	fgx |= fgcolor;		/* ... and OR the original into the low bits */
 +	bgx |= bgcolor;
 +
 +	eorx = fgx ^ bgx;	/* (mask & eorx) ^ bgx: fgx where mask is set */
 +	k = image->width / 2;	/* 32-bit words written per row */
 +
 +	for (i = image->height; i--;) {		/* one pass per image row */
 +		dst = (u32 __iomem *) dst1;
 +		src = s;
 +
 +		j = k;
 +		while (j >= 4) {	/* one source byte -> four words */
 +			u8 bits = *src;
 +			end_mask = tab[(bits >> 6) & 3];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[(bits >> 4) & 3];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[(bits >> 2) & 3];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[bits & 3];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			src++;
 +			j -= 4;
 +		}
 +		if (j != 0) {	/* 1-3 words left: top bits of one more byte */
 +			u8 bits = *src;
 +			end_mask = tab[(bits >> 6) & 3];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			if (j >= 2) {
 +				end_mask = tab[(bits >> 4) & 3];
 +				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +				if (j == 3) {
 +					end_mask = tab[(bits >> 2) & 3];
 +					FB_WRITEL((end_mask & eorx) ^ bgx, dst);
 +				}
 +			}
 +		}
 +		dst1 += p->fix.line_length;	/* next destination row */
 +		s += spitch;			/* next source row */
 +	}
 +}
 +
 +/*
 + * Optimized fast_imageblit for bpp == 32 (ppw = 1, bit_mask = 1 folded
 + * into the code): one pixel per 32-bit word, main loop unrolled.
 + */
 +
 +static inline void fast_imageblit32(const struct fb_image *image,
 +				    struct fb_info *p, u8 __iomem * dst1,
 +				    u32 fgcolor, u32 bgcolor)
 +{
 +	u32 fgx = fgcolor, bgx = bgcolor;
 +	u32 spitch = (image->width + 7) / 8;	/* source bytes per row */
 +	u32 end_mask, eorx;
 +	const char *s = image->data, *src;
 +	u32 __iomem *dst;
 +	const u32 *tab = NULL;
 +	int i, j, k;
 +
 +	tab = cfb_tab32;
 +
 +	eorx = fgx ^ bgx;	/* (mask & eorx) ^ bgx: fgx where mask is set */
 +	k = image->width;	/* 32-bit words written per row */
 +
 +	for (i = image->height; i--;) {		/* one pass per image row */
 +		dst = (u32 __iomem *) dst1;
 +		src = s;
 +
 +		j = k;
 +		while (j >= 8) {	/* one source byte -> eight words */
 +			u8 bits = *src;
 +			end_mask = tab[(bits >> 7) & 1];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[(bits >> 6) & 1];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[(bits >> 5) & 1];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[(bits >> 4) & 1];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[(bits >> 3) & 1];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[(bits >> 2) & 1];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[(bits >> 1) & 1];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			end_mask = tab[bits & 1];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +			src++;
 +			j -= 8;
 +		}
 +		if (j != 0) {	/* 1-7 words left: top bits of one more byte */
 +			u32 bits = (u32) * src;	/* read via bit 7, MSB first */
 +			while (j > 1) {
 +				end_mask = tab[(bits >> 7) & 1];
 +				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 +				bits <<= 1;	/* next source bit into bit 7 */
 +				j--;
 +			}
 +			end_mask = tab[(bits >> 7) & 1];
 +			FB_WRITEL((end_mask & eorx) ^ bgx, dst);
 +		}
 +		dst1 += p->fix.line_length;	/* next destination row */
 +		s += spitch;			/* next source row */
 +	}
 +}
 +
  void cfb_imageblit(struct fb_info *p, const struct fb_image *image)
  {
  	u32 fgcolor, bgcolor, start_index, bitstart, pitch_index = 0;
 @@ -294,11 +426,21 @@ void cfb_imageblit(struct fb_info *p, co
  			bgcolor = image->bg_color;
  		}

 -		if (32 % bpp == 0 && !start_index && !pitch_index &&
 -		    ((width & (32/bpp-1)) == 0) &&
 -		    bpp >= 8 && bpp <= 32)
 -			fast_imageblit(image, p, dst1, fgcolor, bgcolor);
 -		else
 +		if (!start_index && !pitch_index) {
 +			if (bpp == 32)
 +				fast_imageblit32(image, p, dst1, fgcolor,
 +						 bgcolor);
 +			else if (bpp == 16 && (width & 1) == 0)
 +				fast_imageblit16(image, p, dst1, fgcolor,
 +						 bgcolor);
 +			else if (bpp == 8 && (width & 3) == 0)
 +				fast_imageblit(image, p, dst1, fgcolor,
 +					       bgcolor);
 +			else
 +				slow_imageblit(image, p, dst1, fgcolor,
 +					       bgcolor,
 +					       start_index, pitch_index);
 +		} else
  			slow_imageblit(image, p, dst1, fgcolor, bgcolor,
  					start_index, pitch_index);
  	} else
	From 36be92675cdb5eb76ec03997b6ee0b8a1863b08a Mon Sep 17 00:00:00 2001
	From: Harm Hanemaaijer <fgenfb@yahoo.com>
	Date: Thu, 20 Jun 2013 20:21:39 +0200
	Subject: [PATCH] Speed up console framebuffer imageblit function

	Especially on platforms with a slower CPU but a relatively high
	framebuffer fill bandwidth, like current ARM devices, the existing
	console monochrome imageblit function used to draw console text is
	suboptimal for common pixel depths such as 16bpp and 32bpp. The existing
	code is quite general and can deal with several pixel depths. By creating
	special case functions for 16bpp and 32bpp, by far the most common pixel
	formats used on modern systems, a significant speed-up is attained
	which can be readily felt on ARM-based devices like the Raspberry Pi
	and the Allwinner platform, but should help any platform using the
	fb layer.

	The special case functions allow constant folding, eliminating a number
	of instructions including divide operations, and allow the use of an
	unrolled loop, eliminating instructions with a variable shift size,
	reducing source memory access instructions, and eliminating excessive
	branching. These unrolled loops also allow much better code optimization
	by the C compiler. The code that selects which optimized variant is used
	is also simplified, eliminating integer divide instructions.

	The speed-up, measured by timing 'cat file.txt' in the console, varies
	between 40% and 70%, when testing on the Raspberry Pi and Allwinner
	ARM-based platforms, depending on font size and the pixel depth, with
	the greater benefit for 32bpp.

	Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com>
	---
	drivers/video/fbdev/core/cfbimgblt.c | 152 ++++++++++++++++++++++++++-
	1 file changed, 147 insertions(+), 5 deletions(-)

	--- a/drivers/video/fbdev/core/cfbimgblt.c
	+++ b/drivers/video/fbdev/core/cfbimgblt.c
	@@ -28,6 +28,11 @@
	*
	* Also need to add code to deal with cards endians that are different than
	* the native cpu endians. I also need to deal with MSB position in the word.
	+ * Modified by Harm Hanemaaijer (fgenfb@yahoo.com) 2013:
	+ * - Provide optimized versions of fast_imageblit for 16 and 32bpp that are
	+ * significantly faster than the previous implementation.
	+ * - Simplify the fast/slow_imageblit selection code, avoiding integer
	+ * divides.
	*/
	#include <linux/module.h>
	#include <linux/string.h>
	@@ -262,6 +267,133 @@ static inline void fast_imageblit(const
	}
	}

	+/*
	+ * Optimized fast_imageblit for bpp == 16 (ppw = 2, bit_mask = 3 folded
	+ * into the code): two pixels per 32-bit word, main loop unrolled.
	+ */
	+
	+static inline void fast_imageblit16(const struct fb_image *image,
	+				    struct fb_info *p, u8 __iomem * dst1,
	+				    u32 fgcolor, u32 bgcolor)
	+{
	+	u32 fgx = fgcolor, bgx = bgcolor;
	+	u32 spitch = (image->width + 7) / 8;	/* source bytes per row */
	+	u32 end_mask, eorx;
	+	const char *s = image->data, *src;
	+	u32 __iomem *dst;
	+	const u32 *tab = NULL;
	+	int i, j, k;
	+
	+	tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
	+
	+	fgx <<= 16;		/* shift each colour into the upper 16 bits */
	+	bgx <<= 16;
	+	fgx |= fgcolor;		/* ... and OR the original into the low bits */
	+	bgx |= bgcolor;
	+
	+	eorx = fgx ^ bgx;	/* (mask & eorx) ^ bgx: fgx where mask is set */
	+	k = image->width / 2;	/* 32-bit words written per row */
	+
	+	for (i = image->height; i--;) {		/* one pass per image row */
	+		dst = (u32 __iomem *) dst1;
	+		src = s;
	+
	+		j = k;
	+		while (j >= 4) {	/* one source byte -> four words */
	+			u8 bits = *src;
	+			end_mask = tab[(bits >> 6) & 3];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[(bits >> 4) & 3];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[(bits >> 2) & 3];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[bits & 3];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			src++;
	+			j -= 4;
	+		}
	+		if (j != 0) {	/* 1-3 words left: top bits of one more byte */
	+			u8 bits = *src;
	+			end_mask = tab[(bits >> 6) & 3];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			if (j >= 2) {
	+				end_mask = tab[(bits >> 4) & 3];
	+				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+				if (j == 3) {
	+					end_mask = tab[(bits >> 2) & 3];
	+					FB_WRITEL((end_mask & eorx) ^ bgx, dst);
	+				}
	+			}
	+		}
	+		dst1 += p->fix.line_length;	/* next destination row */
	+		s += spitch;			/* next source row */
	+	}
	+}
	+
	+/*
	+ * Optimized fast_imageblit for bpp == 32 (ppw = 1, bit_mask = 1 folded
	+ * into the code): one pixel per 32-bit word, main loop unrolled.
	+ */
	+
	+static inline void fast_imageblit32(const struct fb_image *image,
	+				    struct fb_info *p, u8 __iomem * dst1,
	+				    u32 fgcolor, u32 bgcolor)
	+{
	+	u32 fgx = fgcolor, bgx = bgcolor;
	+	u32 spitch = (image->width + 7) / 8;	/* source bytes per row */
	+	u32 end_mask, eorx;
	+	const char *s = image->data, *src;
	+	u32 __iomem *dst;
	+	const u32 *tab = NULL;
	+	int i, j, k;
	+
	+	tab = cfb_tab32;
	+
	+	eorx = fgx ^ bgx;	/* (mask & eorx) ^ bgx: fgx where mask is set */
	+	k = image->width;	/* 32-bit words written per row */
	+
	+	for (i = image->height; i--;) {		/* one pass per image row */
	+		dst = (u32 __iomem *) dst1;
	+		src = s;
	+
	+		j = k;
	+		while (j >= 8) {	/* one source byte -> eight words */
	+			u8 bits = *src;
	+			end_mask = tab[(bits >> 7) & 1];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[(bits >> 6) & 1];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[(bits >> 5) & 1];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[(bits >> 4) & 1];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[(bits >> 3) & 1];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[(bits >> 2) & 1];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[(bits >> 1) & 1];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			end_mask = tab[bits & 1];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+			src++;
	+			j -= 8;
	+		}
	+		if (j != 0) {	/* 1-7 words left: top bits of one more byte */
	+			u32 bits = (u32) * src;	/* read via bit 7, MSB first */
	+			while (j > 1) {
	+				end_mask = tab[(bits >> 7) & 1];
	+				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
	+				bits <<= 1;	/* next source bit into bit 7 */
	+				j--;
	+			}
	+			end_mask = tab[(bits >> 7) & 1];
	+			FB_WRITEL((end_mask & eorx) ^ bgx, dst);
	+		}
	+		dst1 += p->fix.line_length;	/* next destination row */
	+		s += spitch;			/* next source row */
	+	}
	+}
	+
	void cfb_imageblit(struct fb_info *p, const struct fb_image *image)
	{
	u32 fgcolor, bgcolor, start_index, bitstart, pitch_index = 0;
	@@ -294,11 +426,21 @@ void cfb_imageblit(struct fb_info *p, co
	bgcolor = image->bg_color;
	}

	- if (32 % bpp == 0 && !start_index && !pitch_index &&
	- ((width & (32/bpp-1)) == 0) &&
	- bpp >= 8 && bpp <= 32)
	- fast_imageblit(image, p, dst1, fgcolor, bgcolor);
	- else
	+ if (!start_index && !pitch_index) {
	+ if (bpp == 32)
	+ fast_imageblit32(image, p, dst1, fgcolor,
	+ bgcolor);
	+ else if (bpp == 16 && (width & 1) == 0)
	+ fast_imageblit16(image, p, dst1, fgcolor,
	+ bgcolor);
	+ else if (bpp == 8 && (width & 3) == 0)
	+ fast_imageblit(image, p, dst1, fgcolor,
	+ bgcolor);
	+ else
	+ slow_imageblit(image, p, dst1, fgcolor,
	+ bgcolor,
	+ start_index, pitch_index);
	+ } else
	slow_imageblit(image, p, dst1, fgcolor, bgcolor,
	start_index, pitch_index);
	} else