/////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2000 John Adcock, Tom Barry, Steve Grimm  All rights reserved.
/////////////////////////////////////////////////////////////////////////////
//
//  This file is subject to the terms of the GNU General Public License as
//  published by the Free Software Foundation.  A copy of this license is
//  included with this software distribution in the file COPYING.  If you
//  do not have a copy, you may obtain a copy by writing to the Free
//  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//
//  This software is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details
///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
// Field 1 | Field 2 | Field 3 | Field 4 |
//   T0    |         |    T1   |         |
//         |   M0    |         |    M1   |
//   B0    |         |    B1   |         |
//

// debugging feature
// output the value of mm4 at this point, which is pink where we will weave
// and green where we are going to bob
// uncomment the next line to see this
//#define CHECK_BOBWEAVE

///////////////////////////////////////////////////////////////////////////////
// Greedy 2-frame deinterlacer, one variant per instruction set.
//
// This body is compiled as filterDScaler_SSE, filterDScaler_3DNOW or
// filterDScaler_MMX depending on which of IS_SSE / IS_3DNOW is defined.
//
// Inputs read from pInfo:
//   PictureHistory[0..3]  the four most recent fields ([0] is the newest)
//   Overlay/OverlayPitch  destination buffer and its line stride
//   InputPitch            stride used to step through the source fields
//   LineLength            number of bytes per line to process
//   FieldHeight           number of lines in one field
//   pMemcpy               copy routine used for lines that are passed through
//
// For every output line pair the line from PictureHistory[1] (T1) is copied
// verbatim, and the line below it is built 8 bytes at a time: for each 32-bit
// group the code either "weaves" (uses M1, or the average of M1 and M0 on
// SSE/3DNow!) or "bobs" (uses the average of T1 and B1).  A group is woven
// only when |M1-M0| is within the threshold AND at least one of |T1-T0| and
// |B1-B0| is within the threshold.
//
// Preconditions visible from the code:
//   - LineLength must be at least 8: the inner loop runs LineLength / 8 times
//     and decrements its counter before testing it, so a count of zero would
//     wrap around.  Any LineLength % 8 trailing bytes are not written by the
//     interpolated lines.
//   - NOTE(review): inside the asm loop the stack pointer register is used as
//     the T0 pointer, so the memory operands must not be addressed relative
//     to it (i.e. this presumably relies on a frame pointer being in use) and
//     nothing may touch the stack while the loop runs -- confirm build flags.
///////////////////////////////////////////////////////////////////////////////
#if defined(IS_SSE)
void DScalerFilterGreedy2Frame::filterDScaler_SSE(TDeinterlaceInfo* pInfo)
#elif defined(IS_3DNOW)
void DScalerFilterGreedy2Frame::filterDScaler_3DNOW(TDeinterlaceInfo* pInfo)
#else
void DScalerFilterGreedy2Frame::filterDScaler_MMX(TDeinterlaceInfo* pInfo)
#endif
{
    unsigned char* M1;
    unsigned char* M0;
    unsigned char* T0;
    unsigned char* T1;
    unsigned char* B1;
    unsigned char* B0;
    unsigned char* B0UseInAsm;          // running copy of B0, advanced by the asm loop
    unsigned char* Dest = pInfo->Overlay;
    unsigned char* Dest2;               // running copy of Dest, advanced by the asm loop
    unsigned long  Pitch = pInfo->InputPitch;
    unsigned long  LineLength = pInfo->LineLength;
    unsigned long  oldbx;               // scratch slot: saved value of the bx register
    unsigned long  oldsp;               // scratch slot: saved value of the stack pointer

    int64_t Mask     = 0x7f7f7f7f7f7f7f7full;   // clears the top bit of every byte
    int64_t DwordOne = 0x0000000100000001ull;   // 1 in each 32-bit lane
    int64_t DwordTwo = 0x0000000200000002ull;   // 2 in each 32-bit lane

    // Build the per-byte threshold pattern: _lumaThreshold in the low byte and
    // _chromaThreshold in the next byte, replicated into all four 16-bit lanes.
    // (Presumably this matches a packed luma/chroma byte order -- confirm.)
    // Unsigned arithmetic is used so that replicating a pattern whose top bit
    // is set cannot overflow a signed integer; the resulting bits are the same.
    uint64_t qwGreedyTwoFrameThreshold = _lumaThreshold;
    qwGreedyTwoFrameThreshold += (_chromaThreshold << 8);
    qwGreedyTwoFrameThreshold += (qwGreedyTwoFrameThreshold << 48) +
                                 (qwGreedyTwoFrameThreshold << 32) +
                                 (qwGreedyTwoFrameThreshold << 16);


    // Select the source lines.  M* come from fields [0] and [2], T*/B* from
    // fields [1] and [3]; B is always the line directly below T.
    if(pInfo->PictureHistory[0]->Flags & PICTURE_INTERLACED_ODD) {
        M1 = pInfo->PictureHistory[0]->pData;
        T1 = pInfo->PictureHistory[1]->pData;
        B1 = T1 + Pitch;
        M0 = pInfo->PictureHistory[2]->pData;
        T0 = pInfo->PictureHistory[3]->pData;
        B0 = T0 + Pitch;
    } else {
        M1 = pInfo->PictureHistory[0]->pData + Pitch;
        T1 = pInfo->PictureHistory[1]->pData;
        B1 = T1 + Pitch;
        M0 = pInfo->PictureHistory[2]->pData + Pitch;
        T0 = pInfo->PictureHistory[3]->pData;
        B0 = T0 + Pitch;

        // The newest field starts one line earlier here, so its first line is
        // emitted directly before the main loop.
        pInfo->pMemcpy(Dest, pInfo->PictureHistory[0]->pData, pInfo->LineLength);
        Dest += pInfo->OverlayPitch;
    }

    for (long Line = 0; Line < pInfo->FieldHeight - 1; ++Line) {
        // Always use the most recent data verbatim.  By definition it's correct (it'd
        // be shown on an interlaced display) and our job is to fill in the spaces
        // between the new lines.
        // Uses the same copy routine as every other pass-through line in this function.
        pInfo->pMemcpy(Dest, T1, pInfo->LineLength);
        Dest += pInfo->OverlayPitch;
        Dest2 = Dest;

        // B0 does not get a register of its own; the loop reloads and advances
        // this memory copy on every iteration.
        B0UseInAsm = B0;

        __asm__ __volatile__
            (
             // Save ebx manually (-fPIC trouble)
             MOVX" %%"XBX", %[oldbx]\n\t"
             // Save esp manually (it makes no sense to push/pop the stack pointer itself)
             MOVX" %%"XSP", %[oldsp]\n\t"

             // Figure out what to do with the scanline below the one we just copied.
             MOVX" %[LineLength], %%"XCX"\n\t"   // LineLength
             MOVX" %[T1],         %%"XAX"\n\t"   // T1
             MOVX" %[M1],         %%"XBX"\n\t"   // M1
             MOVX" %[B1],         %%"XDX"\n\t"   // B1
             MOVX" %[M0],         %%"XSI"\n\t"   // M0
             MOVX" %[T0],         %%"XSP"\n\t"   // T0 (stack pointer reused as a data pointer)

             SHRX" $3,      %%"XCX"\n\t"         // there are LineLength / 8 qwords
             "movq %[Mask], %%mm6\n\t"           // Mask

             ".align 8\n\t"
             "1:\n\t"

             MOVX" %[B0UseInAsm], %%"XDI"\n\t" // B0UseInAsm
             "movq (%%"XAX"),     %%mm1\n\t"   // *T1
             "movq (%%"XBX"),     %%mm0\n\t"   // *M1
             "movq (%%"XDX"),     %%mm3\n\t"   // *B1
             "movq (%%"XSI"),     %%mm2\n\t"   // *M0

             // Average T1 and B1 so we can do interpolated bobbing if we bob onto T1.
             "movq   %%mm3,  %%mm7\n\t"       // mm7 = B1
#if defined(IS_SSE)
             "pavgb   %%mm1, %%mm7\n\t"
#elif defined(IS_3DNOW)
             "pavgusb %%mm1, %%mm7\n\t"
#else
             // No byte-average instruction: halve each byte separately, then add.
             "movq  %%mm1, %%mm5\n\t"       // mm5 = T1
             "psrlw $1,    %%mm7\n\t"       // mm7 = B1 / 2
             "pand  %%mm6, %%mm7\n\t"       // mask off bits shifted in from the neighbouring byte
             "psrlw $1,    %%mm5\n\t"       // mm5 = T1 / 2
             "pand  %%mm6, %%mm5\n\t"       // mask off bits shifted in from the neighbouring byte
             "paddw %%mm5, %%mm7\n\t"       // mm7 = T1 / 2 + B1 / 2
#endif

             // calculate |M1-M0| / 2 and put the result in mm4; mm0 must stay usable.
             // If we have a good processor then make mm0 the average of M1 and M0
             // which should make weave look better when there are small amounts of
             // movement
#if defined(IS_SSE)
             "movq    %%mm0, %%mm4\n\t"
             "movq    %%mm2, %%mm5\n\t"
             "psubusb %%mm2, %%mm4\n\t"
             "psubusb %%mm0, %%mm5\n\t"
             "por     %%mm5, %%mm4\n\t"
             "psrlw   $1,    %%mm4\n\t"
             "pavgb   %%mm2, %%mm0\n\t"
             "pand    %%mm6, %%mm4\n\t"
#elif defined(IS_3DNOW)
             "movq    %%mm0, %%mm4\n\t"
             "movq    %%mm2, %%mm5\n\t"
             "psubusb %%mm2, %%mm4\n\t"
             "psubusb %%mm0, %%mm5\n\t"
             "por     %%mm5, %%mm4\n\t"
             "psrlw   $1,    %%mm4\n\t"
             "pavgusb %%mm2, %%mm0\n\t"
             "pand    %%mm6, %%mm4\n\t"
#else
             "movq    %%mm0, %%mm4\n\t"
             "psubusb %%mm2, %%mm4\n\t"
             "psubusb %%mm0, %%mm2\n\t"
             "por     %%mm2, %%mm4\n\t"
             "psrlw   $1,    %%mm4\n\t"
             "pand    %%mm6, %%mm4\n\t"
#endif

             // Score the M lines: each dword of mm4 becomes 2 when none of its
             // bytes exceeds the threshold, and 0 otherwise.
             "pcmpgtb %[qwGreedyTwoFrameThreshold],  %%mm4\n\t"          // bytes above threshold -> 0xff
             "pand    %[Mask],  %%mm4\n\t"      // (Mask) get rid of any sign bit
             "pcmpgtd %[DwordOne], %%mm4\n\t"   // (DwordOne) dword all-ones if any byte was above
             "pandn   %[DwordTwo], %%mm4\n\t"   // invert and keep 2 per dword

             "movq    (%%"XSP"), %%mm2\n\t"    // mm2 = T0

             // calculate |T1-T0| / 2 and put the result in mm5
             "movq    %%mm2, %%mm5\n\t"
             "psubusb %%mm1, %%mm5\n\t"
             "psubusb %%mm2, %%mm1\n\t"
             "por     %%mm1, %%mm5\n\t"
             "psrlw   $1,    %%mm5\n\t"
             "pand    %%mm6, %%mm5\n\t"

             // Score the T lines: add 1 per dword whose bytes are all within the threshold.
             "pcmpgtb %[qwGreedyTwoFrameThreshold], %%mm5\n\t"
             "pand    %%mm6,                        %%mm5\n\t" // get rid of any sign bit
             "pcmpgtd %[DwordOne],                  %%mm5\n\t"
             "pandn   %[DwordOne],                  %%mm5\n\t"
             "paddd   %%mm5,                        %%mm4\n\t"

             "movq    (%%"XDI"), %%mm2\n\t"    // B0

             // calculate |B1-B0| / 2 and put the result in mm5
             "movq    %%mm2, %%mm5\n\t"
             "psubusb %%mm3, %%mm5\n\t"
             "psubusb %%mm2, %%mm3\n\t"
             "por     %%mm3, %%mm5\n\t"
             "psrlw   $1,    %%mm5\n\t"
             "pand    %%mm6, %%mm5\n\t"

             // Score the B lines: add 1 per dword whose bytes are all within the threshold.
             "pcmpgtb %[qwGreedyTwoFrameThreshold], %%mm5\n\t" // (qwGreedyTwoFrameThreshold)
             "pand    %%mm6,                        %%mm5\n\t" // get rid of any sign bit
             "pcmpgtd %[DwordOne],                  %%mm5\n\t"
             "pandn   %[DwordOne],                  %%mm5\n\t"
             "paddd   %%mm5,                        %%mm4\n\t"

             // Advance and store the B0 pointer, then get the dest pointer.
             ADDX"    $8,       %%"XDI"\n\t"
             MOVX"    %%"XDI",  %[B0UseInAsm]\n\t"
             MOVX"    %[Dest2], %%"XDI"\n\t"

             // Weave only where the combined score is above 2, i.e. M is within
             // the threshold and at least one of T and B is as well.
             "pcmpgtd %[DwordTwo], %%mm4\n\t"

             // debugging feature
             // output the value of mm4 at this point which is pink where we will weave
             // and green where we are going to bob
#ifdef CHECK_BOBWEAVE
#ifdef IS_SSE
             "movntq %%mm4, (%%"XDI")\n\t"
#else
             "movq   %%mm4, (%%"XDI")\n\t"
#endif
#else
             "movq   %%mm4, %%mm5\n\t"
             // mm4 is now all-ones where we want to weave and 0 where we want to bob
             "pand   %%mm0, %%mm4\n\t"      // weave pixels
             "pandn  %%mm7, %%mm5\n\t"      // bob pixels
             "por    %%mm5, %%mm4\n\t"
#ifdef IS_SSE
             "movntq %%mm4, (%%"XDI")\n\t"
#else
             "movq   %%mm4, (%%"XDI")\n\t"
#endif
#endif

             // Advance to the next set of pixels.
             ADDX" $8,      %%"XDI"\n\t"
             ADDX" $8,      %%"XAX"\n\t"
             ADDX" $8,      %%"XBX"\n\t"
             ADDX" $8,      %%"XDX"\n\t"
             MOVX" %%"XDI", %[Dest2]\n\t" // Dest2
             ADDX" $8,      %%"XSI"\n\t"
             ADDX" $8,      %%"XSP"\n\t"

             DECX" %%"XCX"\n\t"
             "jne 1b\n\t"

             // Restore the registers that were saved by hand.
             MOVX" %[oldbx], %%"XBX"\n\t"
             MOVX" %[oldsp], %%"XSP"\n\t"

             // These four variables are stored to by the asm, so they must be
             // declared as outputs; input-only operands may not be modified.
             : [B0UseInAsm]                "+m"(B0UseInAsm),
               [Dest2]                     "+m"(Dest2),
               [oldbx]                     "=m"(oldbx),
               [oldsp]                     "=m"(oldsp)

             : [LineLength]                "m"(LineLength),
               [T1]                        "m"(T1),
               [M1]                        "m"(M1),
               [B1]                        "m"(B1),
               [M0]                        "m"(M0),
               [T0]                        "m"(T0),
               [Mask]                      "m"(Mask),
               [qwGreedyTwoFrameThreshold] "m"(qwGreedyTwoFrameThreshold),
               [DwordOne]                  "m"(DwordOne),
               [DwordTwo]                  "m"(DwordTwo)

             : XAX, XCX, XDX, XSI, XDI,
#ifdef ARCH_386
               "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
#endif
               "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
               "memory", "cc"
             );

        Dest += pInfo->OverlayPitch;

        // Step every source pointer down by one input line.
        M1 += Pitch;
        T1 += Pitch;
        B1 += Pitch;
        M0 += Pitch;
        T0 += Pitch;
        B0 += Pitch;
    }

    // Emit the remaining line(s) that the loop above did not cover.
    if(pInfo->PictureHistory[0]->Flags & PICTURE_INTERLACED_ODD) {
        pInfo->pMemcpy(Dest, T1, pInfo->LineLength);
        Dest += pInfo->OverlayPitch;
        pInfo->pMemcpy(Dest, M1, pInfo->LineLength);
    } else {
        pInfo->pMemcpy(Dest, T1, pInfo->LineLength);
    }

    // clear out the MMX registers ready for doing floating point again
#ifdef ARCH_386
    __asm__ __volatile__("emms");
#endif
}
