[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Faster Array transpose
From: |
David Bateman |
Subject: |
Re: Faster Array transpose |
Date: |
Thu, 01 May 2008 00:53:20 +0200 |
User-agent: |
Thunderbird 2.0.0.12 (X11/20080306) |
I think the code below is a good compromise for this function. Sorry I
can't easily create a changeset as thare are other uncomitted changes in
Array.cc in by repository at the moment
D.
template <class T>
Array<T>
Array<T>::transpose (void) const
{
assert (ndims () == 2);
octave_idx_type nr = dim1 ();
octave_idx_type nc = dim2 ();
if (nr >= 8 && nc >= 8)
{
Array<T> result (dim_vector (nc, nr));
// Blocked transpose to attempt to avoid cache misses.
// Don't use OCTAVE_LOCAL_BUFFER here as it doesn't work with bool
// on some compilers.
Array<T> tmp (64);
T *buf = tmp.fortran_vec ();
octave_idx_type ii = 0, jj;
for (jj = 0; jj < (nc - 8 + 1); jj += 8)
{
for (ii = 0; ii < (nr - 8 + 1); ii += 8)
{
// Copy to buffer
for (octave_idx_type j = jj, k = 0, idxj = jj * nr;
j < jj + 8; j++, idxj += nr)
for (octave_idx_type i = ii; i < ii + 8; i++)
buf[k++] = xelem (i + idxj);
// Copy from buffer
for (octave_idx_type i = ii, idxi = ii * nc; i < ii + 8;
i++, idxi += nc)
for (octave_idx_type j = jj, k = i; j < jj + 8;
j++, k+=8)
result.xelem (j + idxi) = buf[k];
}
if (ii < nr)
for (octave_idx_type j = jj; j < jj + 8; j++)
for (octave_idx_type i = ii; i < nr; i++)
result.xelem (j, i) = xelem (i, j);
}
for (octave_idx_type j = jj; j < nc; j++)
for (octave_idx_type i = ii; i < nr; i++)
result.xelem (j, i) = xelem (i, j);
return result;
}
else if (nr > 1 && nc > 1)
{
Array<T> result (dim_vector (nc, nr));
for (octave_idx_type j = 0; j < nc; j++)
for (octave_idx_type i = 0; i < nr; i++)
result.xelem (j, i) = xelem (i, j);
return result;
}
else
{
// Fast transpose for vectors and empty matrices
return Array<T> (*this, dim_vector (nc, nr));
}
}