c - Optmization for quicksort in x86 32-bit assembly -
i'm trying learn basic x86 32-bit assembly programming. in pursuing decided implement quicksort in assembly (sorting integers). first made c-version of sorting function , made assembly version.
however, when comparing assembly version c-version (compiled gcc on debian), c-version performs more 10 times faster on array of 10000 integers.
so question if can give feedback on obvious optimizations can made on quick sort assembly routine. it's purely educational purposes , i'm not expecting beat compiler makers in terms of producing high speed code i'm interested in knowing if i'm making obvious mistakes hampers speed.
the c-version:
void myqsort(int* elems, int sidx, int eidx) { if (sidx < eidx) { int pivot = elems[eidx]; int = sidx; (int j = sidx; j < eidx; j++) { if (elems[j] <= pivot) { swap(&elems[i], &elems[j]); = + 1; } } swap(&elems[i], &elems[eidx]); myqsort(elems, sidx, - 1); myqsort(elems, + 1, eidx); } } void swap(int* a, int* b) { int tmp = *a; *a = *b; *b = tmp; }
assembly version (nasm):
; ; void asm_quick_sort(int* elems, int startindex, int endindex) ; params: ; elems - pointer elements sort - [ebp + 0x8] ; sid - start index of items - [ebp + 0xc] ; eid - end index of items - [ebp + 0x10] asm_quick_sort: push ebp mov ebp, esp push edi push esi push ebx mov eax, dword [ebp + 0xc] ; store start index, = mov ebx, dword [ebp + 0x10] ; store end index mov esi, dword [ebp + 0x8] ; store pointer first element in esi cmp eax, ebx jnl qsort_done mov ecx, eax ; ecx = j, = sid mov edx, dword [esi + (0x4 * ebx)] ; pivot element, elems[eid], edx = pivot qsort_part_loop: ; j = sid; j < eid; j++ cmp ecx, ebx ; if ecx < end index jnb qsort_end_part ; if elems[j] <= pivot cmp edx, dword [esi + (0x4*ecx)] jb qsort_cont_loop ; swap, elems[i], elems[j] push edx ; save pivot mov edx, dword [esi + (0x4*ecx)] ; edx = elems[j] mov edi, dword [esi + (0x4*eax)] ; edi = elems[i] mov dword [esi + (0x4*eax)], edx ; elems[i] = elems[j] mov dword [esi + (0x4*ecx)], edi ; elems[j] = elems[i] pop edx ; restore pivot ; i++ add eax, 0x1 qsort_cont_loop: add ecx, 0x1 jmp qsort_part_loop qsort_end_part: ; swap, elems[i], elems[eid] mov edx, dword [esi + (0x4*eax)] ; edx = elems[i] mov edi, dword [esi + (0x4*ebx)] ; edi = elems[eid] mov dword [esi + (0x4*ebx)], edx ; elems[eidx] = elems[i] mov dword [esi + (0x4*eax)], edi ; elems[i] = elems[eidx] ; qsort(elems, sid, - 1) ; qsort(elems, + 1, eid) sub eax, 0x1 push eax push dword [ebp + 0xc] ; push start idx push dword [ebp + 0x8] ; push elems vector call asm_quick_sort add esp, 0x8 pop eax add eax, 0x1 push dword [ebp + 0x10] ; push end idx push eax push dword [ebp + 0x8] ; push elems vector call asm_quick_sort add esp, 0xc qsort_done: pop ebx pop esi pop edi mov esp, ebp pop ebp ret
i call assembly routine c , use clock() timing routines.
edit difference in performance no longer issue after correcting bugs pointed out fellow stackoverflowers.
you have error in assembly sort implementation, , speed comparisons useless until resolve it. problem recursive call:
myqsort(elems, sidx, - 1);
seeing not case i
not sidx
, might pass value less sidx
function, including -1 if sidx
0. handled in c implementation:
if (sidx < eidx)
but in assembly version:
cmp eax, ebx jae qsort_done
that's unsigned comparison branch instruction! should using jge
. see segfault due problem. when fixed, performance of both implementations appears same according quick tests (compiling -o3). used following test driver:
#include <stdlib.h> #include <stdio.h> void myqsort(int * elems, int sidx, int eidx); #define size 100000 int main(int argc, char **argv) { int * elems = malloc(size * sizeof(int)); (int j = 0; j < 1000; j++) { (int = 0; < size; i++) { elems[i] = rand(); } myqsort(elems, 0, size - 1); } return 0; }
with c version, run-time approx 5.854 seconds. assembly version, 5.829 seconds (i.e. faster).
Comments
Post a Comment