SIMD high-performance xorcpy in C

In my current project I need a high-performance xorcpy implemented in C. Here is my implementation to share; tested on my Xeon E5-2680 PC, it reaches almost 1.7GB/s.

#include <emmintrin.h>
...
/*
 * __forceinline is an MSVC keyword. Map it to the closest equivalent on
 * other compilers so the function below builds unchanged there as well.
 */
#if !defined(_MSC_VER) && !defined(__forceinline)
#  if defined(__GNUC__)
#    define __forceinline static inline __attribute__((always_inline))
#  else
#    define __forceinline static inline
#  endif
#endif

/**
 * XOR block_size bytes of src into dst in place: dst[i] ^= src[i].
 *
 * The bulk of the data is processed 16 bytes at a time with SSE2 unaligned
 * loads and stores, so the pointers do not have to be 16-byte aligned. The
 * remaining (block_size % 16) bytes are processed one byte at a time.
 *
 * @param dst        buffer that is read, XORed and written back
 * @param src        buffer that is only read
 * @param block_size number of bytes to process; 0 leaves dst untouched
 * @return dst
 */
__forceinline unsigned char* xorcpy(unsigned char* dst, const unsigned char* src, unsigned block_size)
{
    // Split the work up front in unsigned arithmetic; counting upwards avoids
    // depending on "0 - 1" wrapping around and being narrowed to a signed -1.
    const unsigned vec_bytes = (unsigned)sizeof(__m128i);
    const unsigned vec_count = block_size / vec_bytes;
    const unsigned tail_count = block_size % vec_bytes;

    // Bulk part: one __m128i (16 bytes) per iteration.
    __m128i* mto = (__m128i*)dst;
    const __m128i* mfrom = (const __m128i*)src;
    for (unsigned i = 0; i < vec_count; ++i)
    {
        const __m128i to_val = _mm_loadu_si128(mto);
        const __m128i from_val = _mm_loadu_si128(mfrom);

        _mm_storeu_si128(mto, _mm_xor_si128(to_val, from_val));  // XOR 16 bytes
        ++mto;
        ++mfrom;
    }

    // Tail part: the bytes that do not fill a whole __m128i.
    unsigned char* cto = (unsigned char*)mto;
    const unsigned char* cfrom = (const unsigned char*)mfrom;
    for (unsigned i = 0; i < tail_count; ++i)
    {
        cto[i] ^= cfrom[i];
    }
    return dst;
}