lwip-users
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [lwip-users] checksum routine in assembler


From: address@hidden
Subject: Re: [lwip-users] checksum routine in assembler
Date: Tue, 17 Nov 2009 19:35:14 +0100
User-agent: Thunderbird 2.0.0.23 (Macintosh/20090812)

Jan Wester wrote:

Hi all

I’m trying to optimize my TCP/IP communication

Does anyone have the checksum, htons and htonl routines in assembler for ARM?

I do have a checksum routine in (gcc) assembler. It's quite optimized, but I haven't used it for some years and don't remember if it can be improved... I'll attach it.

I didn't need htons/htonl though, as I was running the arm in big endian mode. Note that another possibility for optimization is to provide an improved alternative for memcpy. I had a gcc arm assembler version for that, too, but I can only find the word-only routine, which doesn't help much as a generic memcpy replacement.

Feel free to use the routine as you like, I don't think there's much brain power in it ;-) - but don't blame me if it doesn't work!

Simon
#ifndef _ASM_FNS_H
#define _ASM_FNS_H

/*
 * Prototypes for the checksum helpers implemented in ARM assembler.
 */

/*
 * Sums `length` bytes starting at `buf` as 16-bit words and returns the
 * folded result. The sum is NOT inverted (lwIP complements it itself).
 * When `buf` is NULL the routine returns immediately with all bits set.
 */
unsigned short asm_ip_chksum(void *buf, unsigned int length);

/*
 * UDP checksum over `len_udp` bytes at `buf`.
 * NOTE(review): the implementation is not part of this listing; presumably
 * src_addr/dest_addr feed the pseudo-header -- confirm against the source.
 */
unsigned short asm_udp_chksum(void *buf, unsigned int len_udp,
                              void *src_addr, void *dest_addr);

/*
 * Checksum over `byte_cnt` bytes at `data`.
 * NOTE(review): the implementation is not part of this listing.
 */
unsigned short cksum(const void *data, unsigned short byte_cnt);

#endif /* _ASM_FNS_H */

.text

        // Register aliases used by asm_ip_chksum.
        // Some registers deliberately carry two names (r1, r7); two names
        // for the same register must never be live at the same time.

        // r0:          buffer pointer argument, advanced by the
        //              post-indexed loads; r0 also carries the return value
        addr .req r0
        // r1:          byte count argument (in_len); the same register is
        //              referred to as loopc_b once bytes are being counted
        in_len .req r1
        loopc_b .req r1
        // r2:          carry counter (carry-outs of the 32-bit additions)
        carries .req r2
        // r3:          sum (32-bit accumulator)
        sum .req r3
        // r4:          loaded long / temp1
        temp1 .req r4
        // r5:          temp2 (general scratch)
        temp2 .req r5
        // r6:          loop counter (longs)
        loopc .req r6
        // r7:          loop counter (short); temp3 is a second name for r7
        loopc_s .req r7
        temp3 .req r7
        // r8-r10:      additional load targets for the 4-word LDMIA
        temp4 .req r8
        temp5 .req r9
        temp6 .req r10
        // highest register of the r1-rN range that asm_ip_chksum
        // saves on entry and restores on exit
        last_used_reg .req r10



//-----------------------------------------------------------------------
// unsigned short asm_ip_chksum(void *buf, unsigned int length);
//
// Internet (one's-complement) checksum over `length` bytes at `buf`.
//
// In:    r0 = buf    (addr)
//        r1 = length (in_len), treated as unsigned
// Out:   r0 = 16-bit folded sum, zero-extended, NOT inverted (lwIP
//             complements it itself). If buf == NULL, r0 = -1.
// Saves: r1-last_used_reg and lr are pushed on entry and restored on exit.
//
// Assumes a big-endian data layout: a trailing odd byte is added as the
// HIGH byte of a zero-padded halfword.
//
// Method: consume a leading byte/halfword until addr is word-aligned,
// add 32-bit words while counting carry-outs, then fold everything to
// 16 bits. If the buffer started on an odd address, every byte was summed
// in the opposite half of its 16-bit word, so the final result is
// byte-swapped back (RFC 1071).
//-----------------------------------------------------------------------

        .globl asm_ip_chksum
asm_ip_chksum:
        CMP             r0, #0
        MOVEQ   r0, #-1                                 // NULL buffer: return -1
        MOVEQ   pc, lr
        STMFD   sp!, {r1-last_used_reg,r14}

// init
        MOV             carries, #0                     // carry counter = 0
        MOV             sum, #0                         // sum = 0
        MOV             temp3, #0                       // temp3 = 1 if buf started odd

// leading byte: only if addr is odd AND at least 1 byte is available
        TST             addr, #0x1
        BEQ             ip_word_sum
        CMP             in_len, #1
        BLO             ip_word_sum                     // length == 0: read nothing
        LDRB    temp1, [addr], #1                       // load 1 byte
        ADD             sum, sum, temp1                 // low byte of a swapped word
        SUB             in_len, in_len, #1
        MOV             temp3, #1                       // remember: swap at the end

ip_word_sum:

// leading halfword: only if addr is not word-aligned AND >= 2 bytes remain
        TST             addr, #0x2
        BEQ             ip_begin_sum
        CMP             in_len, #2
        BLO             ip_begin_sum                    // 0/1 bytes left: tail code handles it
        LDRH    temp1, [addr], #2                       // load 2 bytes
        ADD             sum, sum, temp1                 // max 0xFF + 0xFFFF, cannot overflow
        SUB             in_len, in_len, #2

ip_begin_sum:

// in_len = bytes still to sum: bit 1 -> trailing halfword, bit 0 -> trailing byte
        MOV             loopc, in_len, lsr #2           // loopc = number of whole 32-bit words

        CMP             loopc, #4
        BLO             ip_after_quad_dword_loop

ip_quad_dword_loop_begin:
// add 4 words per iteration; every carry-out is counted in `carries`
        LDMIA   addr!, {temp1, temp4, temp5, temp6}
        ADDS    sum, sum, temp1
        ADC             carries, carries, #0
        ADDS    sum, sum, temp4
        ADC             carries, carries, #0
        ADDS    sum, sum, temp5
        ADC             carries, carries, #0
        ADDS    sum, sum, temp6
        ADC             carries, carries, #0

        SUB             loopc, loopc, #4                // 4 words consumed
        CMP             loopc, #4
        BHS             ip_quad_dword_loop_begin

ip_after_quad_dword_loop:
        CMP             loopc, #0
        BEQ             ip_after_dword_loop

// remaining 1..3 whole words
ip_dword_loop_begin:
        LDR             temp1, [addr], #4               // load word, post-increment
        ADDS    sum, sum, temp1
        ADC             carries, carries, #0
        SUBS    loopc, loopc, #1
        BNE             ip_dword_loop_begin
ip_after_dword_loop:

// first fold: sum = (sum & 0xffff) + (sum >> 16), at most 0x1FFFE,
// which leaves headroom for the tail additions below
        LDR             temp1, =0xffff
        AND             temp2, sum, temp1
        ADD             sum, temp2, sum, lsr #16

// trailing halfword, if any
        TST             in_len, #0x2
        BEQ             ip_no_short_left
        LDRH    temp2, [addr], #2
        ADD             sum, sum, temp2
ip_no_short_left:

// trailing byte, if any: high byte of a zero-padded halfword
        TST             in_len, #0x1
        BEQ             ip_no_byte_left
        LDRB    temp2, [addr], #1
        ADD             sum, sum, temp2, lsl #8
ip_no_byte_left:

// each counted carry has weight 2^32, which is congruent to 1 (mod 0xffff)
        ADD             sum, sum, carries

// fold twice: a single fold can still leave 0x10000
// (e.g. 0x1FFFF -> 0xFFFF + 1), the second one guarantees sum <= 0xffff
        AND             temp2, sum, temp1
        ADD             sum, temp2, sum, lsr #16
        AND             temp2, sum, temp1
        ADD             sum, temp2, sum, lsr #16

// odd start address: swap the two bytes of the 16-bit result back
        CMP             temp3, #0
        BEQ             ip_no_swap
        MOV             temp2, sum, lsr #8              // temp2 = high byte
        AND             sum, sum, #0xff                 // sum   = low byte
        ORR             sum, temp2, sum, lsl #8         // sum   = (low << 8) | high
ip_no_swap:

        // NO INVERT! for lwIP
        MOV             r0, sum                         // result register

        // restore saved registers and return (saved lr is loaded into pc)
        LDMIA   sp!,{r1-last_used_reg,r15}

        .ltorg                                          // literal pool for =0xffff

reply via email to

[Prev in Thread] Current Thread [Next in Thread]