b.liu | e958203 | 2025-04-17 19:18:16 +0800 | [diff] [blame] | 1 | From 36be92675cdb5eb76ec03997b6ee0b8a1863b08a Mon Sep 17 00:00:00 2001 |
| 2 | From: Harm Hanemaaijer <fgenfb@yahoo.com> |
| 3 | Date: Thu, 20 Jun 2013 20:21:39 +0200 |
| 4 | Subject: [PATCH] Speed up console framebuffer imageblit function |
| 5 | |
| 6 | Especially on platforms with a slower CPU but a relatively high |
| 7 | framebuffer fill bandwidth, like current ARM devices, the existing |
| 8 | console monochrome imageblit function used to draw console text is |
| 9 | suboptimal for common pixel depths such as 16bpp and 32bpp. The existing |
| 10 | code is quite general and can deal with several pixel depths. By creating |
| 11 | special case functions for 16bpp and 32bpp, by far the most common pixel |
| 12 | formats used on modern systems, a significant speed-up is attained |
| 13 | which can be readily felt on ARM-based devices like the Raspberry Pi |
| 14 | and the Allwinner platform, but should help any platform using the |
| 15 | fb layer. |
| 16 | |
| 17 | The special case functions allow constant folding, eliminating a number |
| 18 | of instructions including divide operations, and allow the use of an |
| 19 | unrolled loop, eliminating instructions with a variable shift size, |
| 20 | reducing source memory access instructions, and eliminating excessive |
| 21 | branching. These unrolled loops also allow much better code optimization |
| 22 | by the C compiler. The code that selects which optimized variant is used |
| 23 | is also simplified, eliminating integer divide instructions. |
| 24 | |
| 25 | The speed-up, measured by timing 'cat file.txt' in the console, varies |
| 26 | between 40% and 70% when tested on the Raspberry Pi and Allwinner |
| 27 | ARM-based platforms, depending on font size and pixel depth, with |
| 28 | the greater benefit at 32bpp. |
| 29 | |
| 30 | Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com> |
| 31 | --- |
| 32 | drivers/video/fbdev/core/cfbimgblt.c | 152 ++++++++++++++++++++++++++- |
| 33 | 1 file changed, 147 insertions(+), 5 deletions(-) |
| 34 | |
| 35 | --- a/drivers/video/fbdev/core/cfbimgblt.c |
| 36 | +++ b/drivers/video/fbdev/core/cfbimgblt.c |
| 37 | @@ -28,6 +28,11 @@ |
| 38 | * |
| 39 | * Also need to add code to deal with cards endians that are different than |
| 40 | * the native cpu endians. I also need to deal with MSB position in the word. |
| 41 | + * Modified by Harm Hanemaaijer (fgenfb@yahoo.com) 2013: |
| 42 | + * - Provide optimized versions of fast_imageblit for 16 and 32bpp that are |
| 43 | + * significantly faster than the previous implementation. |
| 44 | + * - Simplify the fast/slow_imageblit selection code, avoiding integer |
| 45 | + * divides. |
| 46 | */ |
| 47 | #include <linux/module.h> |
| 48 | #include <linux/string.h> |
| 49 | @@ -262,6 +267,133 @@ static inline void fast_imageblit(const |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | +/* |
| 54 | + * Optimized fast_imageblit for bpp == 16: ppw = 2 (two pixels per 32-bit |
| 55 | + * write) and bit_mask = 3 are folded into the code, main loop unrolled. |
| 56 | + */ |
| 57 | + |
| 58 | +static inline void fast_imageblit16(const struct fb_image *image, |
| 59 | + struct fb_info *p, u8 __iomem * dst1, |
| 60 | + u32 fgcolor, u32 bgcolor) |
| 61 | +{ |
| 62 | + u32 fgx = fgcolor, bgx = bgcolor; |
| 63 | + u32 spitch = (image->width + 7) / 8; /* source bytes per row, one bit per pixel */ |
| 64 | + u32 end_mask, eorx; |
| 65 | + const char *s = image->data, *src; |
| 66 | + u32 __iomem *dst; |
| 67 | + const u32 *tab = NULL; |
| 68 | + int i, j, k; |
| 69 | + |
| 70 | + tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le; /* mask table picked by fb_be_math() */ |
| 71 | + |
| 72 | + fgx <<= 16; /* replicate each colour into both 16-bit halves of a 32-bit word */ |
| 73 | + bgx <<= 16; |
| 74 | + fgx |= fgcolor; |
| 75 | + bgx |= bgcolor; |
| 76 | + |
| 77 | + eorx = fgx ^ bgx; /* (mask & eorx) ^ bgx gives fgx bits where mask is set, bgx bits elsewhere */ |
| 78 | + k = image->width / 2; /* number of 32-bit writes per row */ |
| 79 | + |
| 80 | + for (i = image->height; i--;) { |
| 81 | + dst = (u32 __iomem *) dst1; |
| 82 | + src = s; |
| 83 | + |
| 84 | + j = k; |
| 85 | + while (j >= 4) { /* one full source byte: 8 pixels, 4 writes, high bit pair first */ |
| 86 | + u8 bits = *src; |
| 87 | + end_mask = tab[(bits >> 6) & 3]; |
| 88 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 89 | + end_mask = tab[(bits >> 4) & 3]; |
| 90 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 91 | + end_mask = tab[(bits >> 2) & 3]; |
| 92 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 93 | + end_mask = tab[bits & 3]; |
| 94 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 95 | + src++; |
| 96 | + j -= 4; |
| 97 | + } |
| 98 | + if (j != 0) { /* 1 to 3 remaining writes taken from the last, partial source byte */ |
| 99 | + u8 bits = *src; |
| 100 | + end_mask = tab[(bits >> 6) & 3]; |
| 101 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 102 | + if (j >= 2) { |
| 103 | + end_mask = tab[(bits >> 4) & 3]; |
| 104 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 105 | + if (j == 3) { |
| 106 | + end_mask = tab[(bits >> 2) & 3]; |
| 107 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst); |
| 108 | + } |
| 109 | + } |
| 110 | + } |
| 111 | + dst1 += p->fix.line_length; /* step the destination by fix.line_length bytes */ |
| 112 | + s += spitch; /* step the source by one row of bitmap bytes */ |
| 113 | + } |
| 114 | +} |
| 115 | + |
| 116 | +/* |
| 117 | + * Optimized fast_imageblit for bpp == 32: ppw = 1 (one pixel per 32-bit |
| 118 | + * write) and bit_mask = 1 are folded into the code, main loop unrolled. |
| 119 | + */ |
| 120 | + |
| 121 | +static inline void fast_imageblit32(const struct fb_image *image, |
| 122 | + struct fb_info *p, u8 __iomem * dst1, |
| 123 | + u32 fgcolor, u32 bgcolor) |
| 124 | +{ |
| 125 | + u32 fgx = fgcolor, bgx = bgcolor; |
| 126 | + u32 spitch = (image->width + 7) / 8; /* source bytes per row, one bit per pixel */ |
| 127 | + u32 end_mask, eorx; |
| 128 | + const char *s = image->data, *src; |
| 129 | + u32 __iomem *dst; |
| 130 | + const u32 *tab = NULL; |
| 131 | + int i, j, k; |
| 132 | + |
| 133 | + tab = cfb_tab32; |
| 134 | + |
| 135 | + eorx = fgx ^ bgx; /* (mask & eorx) ^ bgx gives fgx bits where mask is set, bgx bits elsewhere */ |
| 136 | + k = image->width; /* one 32-bit write per pixel */ |
| 137 | + |
| 138 | + for (i = image->height; i--;) { |
| 139 | + dst = (u32 __iomem *) dst1; |
| 140 | + src = s; |
| 141 | + |
| 142 | + j = k; |
| 143 | + while (j >= 8) { /* one full source byte: 8 pixels, most significant bit first */ |
| 144 | + u8 bits = *src; |
| 145 | + end_mask = tab[(bits >> 7) & 1]; |
| 146 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 147 | + end_mask = tab[(bits >> 6) & 1]; |
| 148 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 149 | + end_mask = tab[(bits >> 5) & 1]; |
| 150 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 151 | + end_mask = tab[(bits >> 4) & 1]; |
| 152 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 153 | + end_mask = tab[(bits >> 3) & 1]; |
| 154 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 155 | + end_mask = tab[(bits >> 2) & 1]; |
| 156 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 157 | + end_mask = tab[(bits >> 1) & 1]; |
| 158 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 159 | + end_mask = tab[bits & 1]; |
| 160 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 161 | + src++; |
| 162 | + j -= 8; |
| 163 | + } |
| 164 | + if (j != 0) { /* 1 to 7 remaining pixels taken from the last, partial source byte */ |
| 165 | + u32 bits = (u32) * src; |
| 166 | + while (j > 1) { |
| 167 | + end_mask = tab[(bits >> 7) & 1]; |
| 168 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst++); |
| 169 | + bits <<= 1; /* bring the next pixel's bit up into bit 7 */ |
| 170 | + j--; |
| 171 | + } |
| 172 | + end_mask = tab[(bits >> 7) & 1]; |
| 173 | + FB_WRITEL((end_mask & eorx) ^ bgx, dst); |
| 174 | + } |
| 175 | + dst1 += p->fix.line_length; /* step the destination by fix.line_length bytes */ |
| 176 | + s += spitch; /* step the source by one row of bitmap bytes */ |
| 177 | + } |
| 178 | +} |
| 179 | + |
| 180 | void cfb_imageblit(struct fb_info *p, const struct fb_image *image) |
| 181 | { |
| 182 | u32 fgcolor, bgcolor, start_index, bitstart, pitch_index = 0; |
| 183 | @@ -294,11 +426,21 @@ void cfb_imageblit(struct fb_info *p, co |
| 184 | bgcolor = image->bg_color; |
| 185 | } |
| 186 | |
| 187 | - if (32 % bpp == 0 && !start_index && !pitch_index && |
| 188 | - ((width & (32/bpp-1)) == 0) && |
| 189 | - bpp >= 8 && bpp <= 32) |
| 190 | - fast_imageblit(image, p, dst1, fgcolor, bgcolor); |
| 191 | - else |
| 192 | + if (!start_index && !pitch_index) { |
| 193 | + if (bpp == 32) |
| 194 | + fast_imageblit32(image, p, dst1, fgcolor, |
| 195 | + bgcolor); |
| 196 | + else if (bpp == 16 && (width & 1) == 0) |
| 197 | + fast_imageblit16(image, p, dst1, fgcolor, |
| 198 | + bgcolor); |
| 199 | + else if (bpp == 8 && (width & 3) == 0) |
| 200 | + fast_imageblit(image, p, dst1, fgcolor, |
| 201 | + bgcolor); |
| 202 | + else |
| 203 | + slow_imageblit(image, p, dst1, fgcolor, |
| 204 | + bgcolor, |
| 205 | + start_index, pitch_index); |
| 206 | + } else |
| 207 | slow_imageblit(image, p, dst1, fgcolor, bgcolor, |
| 208 | start_index, pitch_index); |
| 209 | } else |