bug-gzip
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#41535: [PATCH] performance optimization for aarch64


From: Li Qiang
Subject: bug#41535: [PATCH] performance optimization for aarch64
Date: Thu, 20 Aug 2020 16:55:26 +0800
User-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Thunderbird/68.8.1


在 2020/5/30 17:17, Li Qiang 写道:
> 
> 
> 在 2020/5/26 10:39, l00374334 写道:
>> From: liqiang <liqiang64@huawei.com>
>>
>> By analyzing the compression and decompression process of gzip, I found 
>>
>> that the CRC32 and longest_match functions are significant hot spots.
>>
>>
>>
>> On the aarch64 architecture, we can optimize the efficiency of crc32 
>>
>> through the interface provided by the neon instruction set (12x faster 
>>
>> on aarch64), and optimize the performance of random-access code through 
>>
>> prefetch instructions (about 5%~8% improvement). In some compression 
>>
>> scenarios, loop unrolling can also yield a certain performance improvement 
>>
>> (about 10%).
>>
>>
>>
>> Modified by Li Qiang.
>>
>> ---
>>  configure | 14 ++++++++++++++
>>  deflate.c | 30 +++++++++++++++++++++++++++++-
>>  util.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 88 insertions(+), 1 deletion(-)
>>
>> diff --git a/configure b/configure
>> index cab3daf..dc80cb6 100644
>> --- a/configure
>> +++ b/configure
>> @@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext 
>> conftest.$ac_ext
>>             ;;
>>  
>>           arm* | aarch64 )
>> +           cat confdefs.h - <<_ACEOF >conftest.$ac_ext
>> +/* end confdefs.h.  */
>> +#if defined __ARM_NEON__ || defined __ARM_NEON
>> +                   int ok;
>> +                  #else
>> +                   error fail
>> +                  #endif
>> +
>> +_ACEOF
>> +if ac_fn_c_try_compile "$LINENO"
>> +then :
>> +  CFLAGS="$CFLAGS -march=armv8-a+crc"
>> +fi
>> +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
>>             # Assume arm with EABI.
>>             # On arm64 systems, the C compiler may be generating code in one 
>> of
>>             # these ABIs:
>> diff --git a/deflate.c b/deflate.c
>> index 9d379e9..ee77ffd 100644
>> --- a/deflate.c
>> +++ b/deflate.c
>> @@ -378,6 +378,9 @@ longest_match(IPos cur_match)
>>      register int len;                           /* length of current match 
>> */
>>
>>      int best_len = prev_length;                 /* best match length so far 
>> */
>>
>>      IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : 
>> NIL;
>>
>> +#ifdef __aarch64__
>>
>> +    IPos next_match;
>>
>> +#endif
>>
>>      /* Stop when cur_match becomes <= limit. To simplify the code,
>>
>>       * we prevent matches with the string of window index 0.
>>
>>       */
>>
>> @@ -411,6 +414,10 @@ longest_match(IPos cur_match)
>>      do {
>>
>>          Assert(cur_match < strstart, "no future");
>>
>>          match = window + cur_match;
>>
>> +#ifdef __aarch64__
>>
>> +        next_match = prev[cur_match & WMASK];
>>
>> +        __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK])));
>>
>> +#endif
>>
>>  
>>
>>          /* Skip to next match if the match length cannot increase
>>
>>           * or if the match length is less than 2:
>>
>> @@ -488,8 +495,14 @@ longest_match(IPos cur_match)
>>              scan_end   = scan[best_len];
>>
>>  #endif
>>
>>          }
>>
>> -    } while ((cur_match = prev[cur_match & WMASK]) > limit
>>
>> +    }
>>
>> +#ifdef __aarch64__
>>
>> +    while ((cur_match = next_match) > limit
>>
>> +             && --chain_length != 0);
>>
>> +#else
>>
>> +    while ((cur_match = prev[cur_match & WMASK]) > limit
>>
>>               && --chain_length != 0);
>>
>> +#endif
>>
>>  
>>
>>      return best_len;
>>
>>  }
>>
>> @@ -777,7 +790,22 @@ deflate (int pack_level)
>>              lookahead -= prev_length-1;
>>
>>              prev_length -= 2;
>>
>>              RSYNC_ROLL(strstart, prev_length+1);
>>
>> +            while (prev_length >= 4) {
>>
>> +                /* After actual verification, expanding this loop
>>
>> +                 * can improve its performance in certain scenarios.
>>
>> +                 */
>>
>> +                prev_length -= 4;
>>
>> +                strstart++;
>>
>> +                INSERT_STRING(strstart, hash_head);
>>
>> +                strstart++;
>>
>> +                INSERT_STRING(strstart, hash_head);
>>
>> +                strstart++;
>>
>> +                INSERT_STRING(strstart, hash_head);
>>
>> +                strstart++;
>>
>> +                INSERT_STRING(strstart, hash_head);
>>
>> +            }
>>
>>              do {
>>
>> +                if (prev_length == 0) break;
>>
>>                  strstart++;
>>
>>                  INSERT_STRING(strstart, hash_head);
>>
>>                  /* strstart never exceeds WSIZE-MAX_MATCH, so there are
>>
>> diff --git a/util.c b/util.c
>> index 0a0fc21..c9f0e52 100644
>> --- a/util.c
>> +++ b/util.c
>> @@ -38,6 +38,12 @@
>>  
>>
>>  static int write_buffer (int, voidp, unsigned int);
>>
>>  
>>
>> +#if defined __ARM_NEON__ || defined __ARM_NEON
>>
>> +#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c], 
>> %x[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c], 
>> %w[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c], 
>> %w[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c], 
>> %w[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#else
>>
>>  /* ========================================================================
>>
>>   * Table of CRC-32's of all single-byte values (made by makecrc.c)
>>
>>   */
>>
>> @@ -95,6 +101,7 @@ static const ulg crc_32_tab[] = {
>>    0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
>>
>>    0x2d02ef8dL
>>
>>  };
>>
>> +#endif
>>
>>  
>>
>>  /* Shift register contents.  */
>>
>>  static ulg crc = 0xffffffffL;
>>
>> @@ -132,6 +139,43 @@ ulg updcrc(s, n)
>>      const uch *s;           /* pointer to bytes to pump through */
>>
>>      unsigned n;             /* number of bytes in s[] */
>>
>>  {
>>
>> +#if defined __ARM_NEON__ || defined __ARM_NEON
>>
>> +    register ulg c;
>>
>> +    static ulg crc = (ulg)0xffffffffL;
>>
>> +    register const uint8_t  *buf1;
>>
>> +    register const uint16_t *buf2;
>>
>> +    register const uint32_t *buf4;
>>
>> +    register const uint64_t *buf8;
>>
>> +    int64_t length = (int64_t)n;
>>
>> +    buf8 = (const  uint64_t *)(const void *)s;
>>
>> +
>>
>> +    if (s == NULL) {
>>
>> +        c = 0xffffffffL;
>>
>> +    } else {
>>
>> +        c = crc;
>>
>> +        while(length >= sizeof(uint64_t)) {
>>
>> +            CRC32D(c, *buf8++);
>>
>> +            length -= sizeof(uint64_t);
>>
>> +        }
>>
>> +        buf4 = (const uint32_t *)(const void *)buf8;
>>
>> +        if (length >= sizeof(uint32_t)) {
>>
>> +            CRC32W(c, *buf4++);
>>
>> +            length -= sizeof(uint32_t);
>>
>> +        }
>>
>> +        buf2 = (const uint16_t *)(const void *)buf4;
>>
>> +        if(length >= sizeof(uint16_t)) {
>>
>> +            CRC32H(c, *buf2++);
>>
>> +            length -= sizeof(uint16_t);
>>
>> +        }
>>
>> +        buf1 = (const uint8_t *)(const void *)buf2;
>>
>> +        if (length >= sizeof(uint8_t)) {
>>
>> +            CRC32B(c, *buf1);
>>
>> +            length -= sizeof(uint8_t);
>>
>> +        }
>>
>> +    }
>>
>> +    crc = c;
>>
>> +    return (c ^ 0xffffffffL);
>>
>> +#else
>>
>>      register ulg c;         /* temporary variable */
>>
>>  
>>
>>      if (s == NULL) {
>>
>> @@ -144,6 +188,7 @@ ulg updcrc(s, n)
>>      }
>>
>>      crc = c;
>>
>>      return c ^ 0xffffffffL;       /* (instead of ~c for 64-bit machines) */
>>
>> +#endif
>>
>>  }
>>
>>  
>>
>>  /* Return a current CRC value.  */
>>
> 
> Please allow me to show a set of actual test data for this patch.
> 
> First, I made an original version of the program "gzip-1.10" based
> on the gzip-1.10 source code, and then made an optimized version of
> the program "gzip-optimized" after applying my optimization patch.
> 
> Next I used the gzip-1.10 version to measure the compression and decompression
> time on some **xml** files:
> [XML]# time ./gzip-1.10 *.xml
> 
> real    0m5.099s
> user    0m4.384s
> sys     0m0.176s
> [XML]# time ./gzip-1.10 -d *.gz
> 
> real    0m2.173s
> user    0m1.821s
> sys     0m0.348s
> 
> Then use the optimized version to compare:
> [XML]# time ./gzip-optimized *.xml
> 
> real    0m2.785s
> user    0m2.576s
> sys     0m0.204s
> [XML]# time ./gzip-optimized -d *.gz
> 
> real    0m0.497s
> user    0m0.176s
> sys     0m0.320s
> 
> 
> The next test object is a large **log** file:
> [LOG]# time ./gzip-1.10 *.log
> 
> real    0m8.883s
> user    0m8.652s
> sys     0m0.217s
> [LOG]# time ./gzip-1.10 -d *.gz
> 
> real    0m3.049s
> user    0m2.604s
> sys     0m0.439s
> 
> Also use the optimized version to compare:
> [LOG]# time ./gzip-optimized *.log
> 
> real    0m6.882s
> user    0m6.607s
> sys     0m0.264s
> [LOG]# time ./gzip-optimized -d *.gz
> 
> real    0m1.054s
> user    0m0.622s
> sys     0m0.431s
> 
> The above experimental data are from the aarch64 platform.
> 

Gentle ping.
: )

-- 
Best regards,
Li Qiang






reply via email to

[Prev in Thread] Current Thread [Next in Thread]