ASR_BASE

Change-Id: Icf3719cc0afe3eeb3edc7fa80a2eb5199ca9dda1
diff --git a/target/linux/bcm27xx/patches-5.4/950-0041-Speed-up-console-framebuffer-imageblit-function.patch b/target/linux/bcm27xx/patches-5.4/950-0041-Speed-up-console-framebuffer-imageblit-function.patch
new file mode 100644
index 0000000..2524f56
--- /dev/null
+++ b/target/linux/bcm27xx/patches-5.4/950-0041-Speed-up-console-framebuffer-imageblit-function.patch
@@ -0,0 +1,209 @@
+From 36be92675cdb5eb76ec03997b6ee0b8a1863b08a Mon Sep 17 00:00:00 2001
+From: Harm Hanemaaijer <fgenfb@yahoo.com>
+Date: Thu, 20 Jun 2013 20:21:39 +0200
+Subject: [PATCH] Speed up console framebuffer imageblit function
+
+Especially on platforms with a slower CPU but a relatively high
+framebuffer fill bandwidth, like current ARM devices, the existing
+console monochrome imageblit function used to draw console text is
+suboptimal for common pixel depths such as 16bpp and 32bpp. The existing
+code is quite general and can deal with several pixel depths. By creating
+special case functions for 16bpp and 32bpp, by far the most common pixel
+formats used on modern systems, a significant speed-up is attained
+which can be readily felt on ARM-based devices like the Raspberry Pi
+and the Allwinner platform, but should help any platform using the
+fb layer.
+
+The special case functions allow constant folding, eliminating a number
+of instructions including divide operations, and allow the use of an
+unrolled loop, eliminating instructions with a variable shift size,
+reducing source memory access instructions, and eliminating excessive
+branching. These unrolled loops also allow much better code optimization
+by the C compiler. The code that selects which optimized variant is used
+is also simplified, eliminating integer divide instructions.
+
+The speed-up, measured by timing 'cat file.txt' in the console, varies
+between 40% and 70%, when testing on the Raspberry Pi and Allwinner
+ARM-based platforms, depending on font size and the pixel depth, with
+the greater benefit for 32bpp.
+
+Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com>
+---
+ drivers/video/fbdev/core/cfbimgblt.c | 152 ++++++++++++++++++++++++++-
+ 1 file changed, 147 insertions(+), 5 deletions(-)
+
+--- a/drivers/video/fbdev/core/cfbimgblt.c
++++ b/drivers/video/fbdev/core/cfbimgblt.c
+@@ -28,6 +28,11 @@
+  *
+  *  Also need to add code to deal with cards endians that are different than
+  *  the native cpu endians. I also need to deal with MSB position in the word.
++ *  Modified by Harm Hanemaaijer (fgenfb@yahoo.com) 2013:
++ *  - Provide optimized versions of fast_imageblit for 16 and 32bpp that are
++ *    significantly faster than the previous implementation.
++ *  - Simplify the fast/slow_imageblit selection code, avoiding integer
++ *    divides.
+  */
+ #include <linux/module.h>
+ #include <linux/string.h>
+@@ -262,6 +267,133 @@ static inline void fast_imageblit(const
+ 	}
+ }	
+ 	
++/*
++ * Optimized fast_imageblit for bpp == 16. ppw = 2, bit_mask = 3 folded
++ * into the code, main loop unrolled. Expects an even image width.
++ */
++
++static inline void fast_imageblit16(const struct fb_image *image,
++				    struct fb_info *p, u8 __iomem * dst1,
++				    u32 fgcolor, u32 bgcolor)
++{
++	u32 fgx = fgcolor, bgx = bgcolor;
++	u32 spitch = (image->width + 7) / 8;	/* source bytes per row */
++	u32 end_mask, eorx;
++	const char *s = image->data, *src;
++	u32 __iomem *dst;
++	const u32 *tab = NULL;
++	int i, j, k;
++
++	tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
++
++	fgx <<= 16;	/* replicate each color into both halves of a word */
++	bgx <<= 16;
++	fgx |= fgcolor;
++	bgx |= bgcolor;
++
++	eorx = fgx ^ bgx;	/* (mask & eorx) ^ bgx: fgx where mask set, else bgx */
++	k = image->width / 2;	/* 32-bit words per row, two pixels each */
++
++	for (i = image->height; i--;) {
++		dst = (u32 __iomem *) dst1;
++		src = s;
++
++		j = k;
++		while (j >= 4) {	/* one source byte yields four words */
++			u8 bits = *src;
++			end_mask = tab[(bits >> 6) & 3];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[(bits >> 4) & 3];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[(bits >> 2) & 3];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[bits & 3];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			src++;
++			j -= 4;
++		}
++		if (j != 0) {	/* 1..3 words left: use the top bits of the next byte */
++			u8 bits = *src;
++			end_mask = tab[(bits >> 6) & 3];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			if (j >= 2) {
++				end_mask = tab[(bits >> 4) & 3];
++				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++				if (j == 3) {
++					end_mask = tab[(bits >> 2) & 3];
++					FB_WRITEL((end_mask & eorx) ^ bgx, dst);
++				}
++			}
++		}
++		dst1 += p->fix.line_length;	/* next destination row */
++		s += spitch;	/* next source bitmap row */
++	}
++}
++
++/*
++ * Optimized fast_imageblit for bpp == 32. ppw = 1, bit_mask = 1 folded
++ * into the code, main loop unrolled; a tail loop handles width % 8 pixels.
++ */
++
++static inline void fast_imageblit32(const struct fb_image *image,
++				    struct fb_info *p, u8 __iomem * dst1,
++				    u32 fgcolor, u32 bgcolor)
++{
++	u32 fgx = fgcolor, bgx = bgcolor;
++	u32 spitch = (image->width + 7) / 8;	/* source bytes per row */
++	u32 end_mask, eorx;
++	const char *s = image->data, *src;
++	u32 __iomem *dst;
++	const u32 *tab = NULL;
++	int i, j, k;
++
++	tab = cfb_tab32;
++
++	eorx = fgx ^ bgx;	/* (mask & eorx) ^ bgx: fgx where mask set, else bgx */
++	k = image->width;	/* one 32-bit word per pixel */
++
++	for (i = image->height; i--;) {
++		dst = (u32 __iomem *) dst1;
++		src = s;
++
++		j = k;
++		while (j >= 8) {	/* one source byte yields eight pixels, bit 7 first */
++			u8 bits = *src;
++			end_mask = tab[(bits >> 7) & 1];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[(bits >> 6) & 1];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[(bits >> 5) & 1];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[(bits >> 4) & 1];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[(bits >> 3) & 1];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[(bits >> 2) & 1];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[(bits >> 1) & 1];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			end_mask = tab[bits & 1];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++			src++;
++			j -= 8;
++		}
++		if (j != 0) {	/* 1..7 pixels left in a partial source byte */
++			u32 bits = (u32) * src;
++			while (j > 1) {
++				end_mask = tab[(bits >> 7) & 1];
++				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
++				bits <<= 1;	/* move the next pixel's bit into bit 7 */
++				j--;
++			}
++			end_mask = tab[(bits >> 7) & 1];
++			FB_WRITEL((end_mask & eorx) ^ bgx, dst);
++		}
++		dst1 += p->fix.line_length;	/* next destination row */
++		s += spitch;	/* next source bitmap row */
++	}
++}
++
+ void cfb_imageblit(struct fb_info *p, const struct fb_image *image)
+ {
+ 	u32 fgcolor, bgcolor, start_index, bitstart, pitch_index = 0;
+@@ -294,11 +426,21 @@ void cfb_imageblit(struct fb_info *p, co
+ 			bgcolor = image->bg_color;
+ 		}	
+ 		
+-		if (32 % bpp == 0 && !start_index && !pitch_index && 
+-		    ((width & (32/bpp-1)) == 0) &&
+-		    bpp >= 8 && bpp <= 32) 			
+-			fast_imageblit(image, p, dst1, fgcolor, bgcolor);
+-		else 
++		if (!start_index && !pitch_index) {
++			if (bpp == 32)
++				fast_imageblit32(image, p, dst1, fgcolor,
++						 bgcolor);
++			else if (bpp == 16 && (width & 1) == 0)
++				fast_imageblit16(image, p, dst1, fgcolor,
++						 bgcolor);
++			else if (bpp == 8 && (width & 3) == 0)
++				fast_imageblit(image, p, dst1, fgcolor,
++					       bgcolor);
++			else
++				slow_imageblit(image, p, dst1, fgcolor,
++					       bgcolor,
++					       start_index, pitch_index);
++		} else
+ 			slow_imageblit(image, p, dst1, fgcolor, bgcolor,
+ 					start_index, pitch_index);
+ 	} else