c - Optmization for quicksort in x86 32-bit assembly -


i'm trying learn basic x86 32-bit assembly programming. in pursuing decided implement quicksort in assembly (sorting integers). first made c-version of sorting function , made assembly version.

however, when comparing assembly version c-version (compiled gcc on debian), c-version performs more 10 times faster on array of 10000 integers.

so question if can give feedback on obvious optimizations can made on quick sort assembly routine. it's purely educational purposes , i'm not expecting beat compiler makers in terms of producing high speed code i'm interested in knowing if i'm making obvious mistakes hampers speed.

the c-version:

void myqsort(int* elems, int sidx, int eidx) {      if (sidx < eidx)     {         int pivot = elems[eidx];         int = sidx;         (int j = sidx; j < eidx; j++)         {             if (elems[j] <= pivot)             {                 swap(&elems[i], &elems[j]);                 = + 1;             }         }         swap(&elems[i], &elems[eidx]);         myqsort(elems, sidx, - 1);         myqsort(elems, + 1, eidx);     } } void swap(int* a, int* b) {     int tmp = *a;     *a = *b;     *b = tmp; } 

assembly version (nasm):

; ; void asm_quick_sort(int* elems, int startindex, int endindex) ; params: ;       elems - pointer elements sort - [ebp + 0x8] ;       sid - start index of items - [ebp + 0xc] ;       eid - end index of items - [ebp + 0x10] asm_quick_sort:      push ebp     mov ebp, esp      push edi     push esi     push ebx      mov eax, dword [ebp + 0xc]  ; store start index,  =     mov ebx, dword [ebp + 0x10] ; store end index     mov esi, dword [ebp + 0x8]  ; store pointer first element in esi      cmp eax, ebx     jnl qsort_done      mov ecx, eax                        ; ecx = j, = sid     mov edx, dword [esi + (0x4 * ebx)]  ; pivot element, elems[eid], edx = pivot qsort_part_loop:     ; j = sid; j < eid; j++     cmp ecx, ebx                    ; if ecx < end index     jnb qsort_end_part     ; if elems[j] <= pivot     cmp edx, dword [esi + (0x4*ecx)]     jb qsort_cont_loop     ; swap, elems[i], elems[j]     push edx ; save pivot     mov edx, dword [esi + (0x4*ecx)]        ; edx = elems[j]     mov edi, dword [esi + (0x4*eax)]        ; edi = elems[i]     mov dword [esi + (0x4*eax)], edx        ; elems[i] = elems[j]     mov dword [esi + (0x4*ecx)], edi        ; elems[j] = elems[i]     pop edx ; restore pivot     ; i++     add eax, 0x1 qsort_cont_loop:     add ecx, 0x1     jmp qsort_part_loop qsort_end_part:     ; swap, elems[i], elems[eid]     mov edx, dword [esi + (0x4*eax)]        ; edx = elems[i]     mov edi, dword [esi + (0x4*ebx)]        ; edi = elems[eid]     mov dword [esi + (0x4*ebx)], edx        ; elems[eidx] = elems[i]     mov dword [esi + (0x4*eax)], edi        ; elems[i] = elems[eidx]      ; qsort(elems, sid, - 1)     ; qsort(elems, + 1, eid)     sub eax, 0x1     push eax     push dword [ebp + 0xc]  ; push start idx     push dword [ebp + 0x8]  ; push elems vector     call asm_quick_sort     add esp, 0x8     pop eax     add eax, 0x1     push dword [ebp + 0x10] ; push end idx     push eax     push dword [ebp + 0x8]  ; push elems vector     call asm_quick_sort     add esp, 0xc   qsort_done:     pop ebx     pop esi     pop edi      mov esp, ebp     pop ebp      ret 

i call assembly routine c , use clock() timing routines.

edit difference in performance no longer issue after correcting bugs pointed out fellow stackoverflowers.

you have error in assembly sort implementation, , speed comparisons useless until resolve it. problem recursive call:

    myqsort(elems, sidx, - 1); 

seeing not case i not sidx, might pass value less sidx function, including -1 if sidx 0. handled in c implementation:

if (sidx < eidx) 

but in assembly version:

cmp eax, ebx jae qsort_done 

that's unsigned comparison branch instruction! should using jge. see segfault due problem. when fixed, performance of both implementations appears same according quick tests (compiling -o3). used following test driver:

#include <stdlib.h> #include <stdio.h>  void myqsort(int * elems, int sidx, int eidx);  #define size 100000  int main(int argc, char **argv) {     int * elems = malloc(size * sizeof(int));      (int j = 0; j < 1000; j++) {          (int = 0; < size; i++) {             elems[i] = rand();         }          myqsort(elems, 0, size - 1);     }     return 0; } 

with c version, run-time approx 5.854 seconds. assembly version, 5.829 seconds (i.e. faster).


Comments

Popular posts from this blog

resizing Telegram inline keyboard -

command line - How can a Python program background itself? -

php - "cURL error 28: Resolving timed out" on Wordpress on Azure App Service on Linux -