/*
 * Copyright (C) 2007-2019 Xagasoft, All rights reserved.
 *
 * This file is part of the libbu++ library and is released under the
 * terms of the license contained in the file LICENSE.
 */

#include "bu/utfstring.h"

#include "bu/string.h"
#include "bu/blob.h"
#include "bu/stream.h"
#include "bu/config.h"
#include "bu/sio.h"
#include "bu/membuf.h"
#include "bu/formatter.h"

using Bu::sio;

uint8_t Bu::UtfString::utf8_lmask[8] = {
    0x00,
    0x01,
    0x03,
    0x07,
    0x0f,
    0x1f,
    0x3f,
    0x7f
};

Bu::UtfString::UtfString()
{
}

Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc )
{
    set( sInput, eEnc );
}

Bu::UtfString::UtfString( const Bu::Blob &sInput, Encoding eEnc )
{
    set( sInput, eEnc );
}

Bu::UtfString::UtfString( const char *sInput, Encoding eEnc )
{
    set( Bu::Blob(sInput), eEnc );
}

Bu::UtfString::~UtfString()
{
}

Bu::UtfString::iterator Bu::UtfString::begin()
{
    return Bu::UtfString::iterator( this, 0 );
}

Bu::UtfString::const_iterator Bu::UtfString::begin() const
{
    return Bu::UtfString::const_iterator( this, 0 );
}

void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
{
    set( sInput, eEnc );
}

void Bu::UtfString::set( const Bu::Blob &sInput, Encoding eEnc )
{
    switch( eEnc )
    {
        case Utf8:
            setUtf8( sInput );
            break;

        case Utf16:
            setUtf16( sInput );
            break;

        case Utf16be:
            setUtf16be( sInput );
            break;

        case Utf16le:
            setUtf16le( sInput );
            break;

        case Utf32:
            setUtf32( sInput );
            break;

        case Utf32be:
            setUtf32be( sInput );
            break;

        case Utf32le:
            setUtf32le( sInput );
            break;

        case Ucs2:
            throw Bu::ExceptionBase("Ucs2 not supported yet.");
            break;

        case Ucs4:
            throw Bu::ExceptionBase("Ucs4 not supported yet.");
            break;

        case GuessEncoding:
            throw Bu::ExceptionBase("Guessing mode not supported yet.");
            break;
    }
}

void Bu::UtfString::append( UtfChar ch )
{
    if( ch >= 0x10000 )
    {
        ch -= 0x10000;
        append16( ((ch>>10)&0x3FF)| 0xD800u );
        append16( (ch&0x3FF)| 0xDC00u );
    }
    else
    {
        append16( (uint16_t)(ch) );
    }
}

void Bu::UtfString::append( const UtfString &rSrc )
{
    aData.append( rSrc.aData );
    iRawLen += rSrc.iRawLen;
    iCharLen += rSrc.iCharLen;
}

void Bu::UtfString::setUtf8( const Bu::Blob &sInput )
{
    for( Bu::Blob::const_iterator i = sInput.begin(); i; i++ )
    {
        if( ((int)(uint8_t)*i)&0x80 )
        {
            int iBytes = 1;
            for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
            Bu::UtfChar uPt = ((*i) & utf8_lmask[7-iBytes])<<(6*(iBytes-1));
            for( iBytes--; iBytes >= 1; iBytes-- )
            {
                i++;
                uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1));
            }
            append( uPt );
        }
        else
        {
            append( (Bu::UtfChar)(*i) );
        }
    }
}

void Bu::UtfString::setUtf16( const Bu::Blob &sInput )
{
//  Bu::Blob::const_iterator i = sInput.begin();
    if( (uint8_t)*sInput.begin() == 0xFF &&
        (uint8_t)*(sInput.begin()+1) == 0xFE )
    {
        setUtf16le( sInput );
        return;
    }
    setUtf16be( sInput );
}

void Bu::UtfString::setUtf16be( const Bu::Blob &sInput )
{
    Bu::Blob::const_iterator i = sInput.begin();
    if( (uint8_t)*sInput.begin() == 0xFE &&
        (uint8_t)*(sInput.begin()+1) == 0xFF )

    {
        i += 2;
        sio << "Verified big endian." << sio.nl;
    }
    else
    {
        sio << "Assuming big endian." << sio.nl;
    }
    uint16_t hi, lo;
    for( ; i; i++ )
    {
        hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i));
        append16( hi );
        if( (hi&0xD800u) == 0xD800u )
        {
            lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i));
            append16( lo );
        }
    }
}

void Bu::UtfString::setUtf16le( const Bu::Blob &sInput )
{
    Bu::Blob::const_iterator i = sInput.begin();
    if( (uint8_t)*sInput.begin() == 0xFF &&
        (uint8_t)*(sInput.begin()+1) == 0xFE )
    {
        i += 2;
        sio << "Verified little endian." << sio.nl;
    }
    else
    {
        sio << "Assuming little endian." << sio.nl;
    }
    uint16_t hi, lo;
    for( ; i; i++ )
    {
        hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8);
        append16( hi );
        if( (hi&0xD800u) == 0xD800u )
        {
            lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8);
            append16( lo );
        }
    }
}

void Bu::UtfString::setUtf32( const Bu::Blob &sInput )
{
    Bu::Blob::const_iterator i = sInput.begin();
    if( (uint8_t)*i == 0x00 &&
        (uint8_t)*(++i) == 0x00 &&
        (uint8_t)*(++i) == 0xFF &&
        (uint8_t)*(++i) == 0xFE )
    {
        setUtf32le( sInput );
        return;
    }
    setUtf32be( sInput );
}

void Bu::UtfString::setUtf32be( const Bu::Blob &sInput )
{
    Bu::Blob::const_iterator i = sInput.begin();
    if( (uint8_t)*i == 0x00 &&
        (uint8_t)*(++i) == 0x00 &&
        (uint8_t)*(++i) == 0xFE &&
        (uint8_t)*(++i) == 0xFF )
    {
        i++;
        sio << "Verified big endian." << sio.nl;
    }
    else
    {
        i = sInput.begin();
        sio << "Assuming big endian." << sio.nl;
    }
    for( ; i; i++ )
    {
        append( (((uint8_t)*i)<<24) |
                (((uint8_t)*(++i))<<16) |
                (((uint8_t)*(++i))<<8) |
                ((uint8_t)*(++i))
              );
    }
}

void Bu::UtfString::setUtf32le( const Bu::Blob &sInput )
{
    Bu::Blob::const_iterator i = sInput.begin();
    if( (uint8_t)*i == 0x00 &&
        (uint8_t)*(++i) == 0x00 &&
        (uint8_t)*(++i) == 0xFF &&
        (uint8_t)*(++i) == 0xFE )
    {
        i++;
        sio << "Verified little endian." << sio.nl;
    }
    else
    {
        i = sInput.begin();
        sio << "Assuming little endian." << sio.nl;
    }
    for( ; i; i++ )
    {
        append( ((uint8_t)*i) |
                (((uint8_t)*(++i))<<8) |
                (((uint8_t)*(++i))<<16) |
                (((uint8_t)*(++i))<<24)
              );
    }
}

void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) const
{
    switch( eEnc )
    {
        case Utf8:
            writeUtf8( sOut );
            break;

        case Utf16:
//          writeUtf16( sOut );
//          break;

        case Utf16be:
            writeUtf16be( sOut );
            break;

        case Utf16le:
            writeUtf16le( sOut );
            break;

        case Utf32:
//          writeUtf32( sOut );
//          break;

        case Utf32be:
            writeUtf32be( sOut );
            break;

        case Utf32le:
            writeUtf32le( sOut );
            break;

        case Ucs2:
            throw Bu::ExceptionBase("Ucs2 not supported yet.");
            break;

        case Ucs4:
            throw Bu::ExceptionBase("Ucs4 not supported yet.");
            break;

        case GuessEncoding:
            throw Bu::ExceptionBase(
                "GuessEncoding is incompatible with encoding.");
            break;

    }
}

int Bu::UtfString::readPoint( Bu::Stream &sIn, Bu::UtfChar &c,
        Bu::UtfString::Encoding sEnc )
{
    switch( sEnc )
    {
        case Utf8:
            {
                uint8_t i;
                int iRead = 1;
                if( sIn.read( &i, 1 ) < 1 )
                    return 0;
                if( ((int)i)&0x80 )
                {
                    int iBytes = 1;
                    for(; (((uint8_t)i)<<iBytes)&0x80; iBytes++ ) { }
                    iRead = iBytes;
                    c = (i & utf8_lmask[7-iBytes])<<(6*(iBytes-1));
                    for( iBytes--; iBytes >= 1; iBytes-- )
                    {
                        if( sIn.read( &i, 1 ) < 1 )
                            return 0;
                        c |= (i&utf8_lmask[6])<<(6*(iBytes-1));
                    }
                    return iRead;
                }
                else
                {
                    c = (Bu::UtfChar)i;
                    return 1;
                }
            }
            break;

        case Utf16:
        case Utf16be:
        case Utf16le:
        case Utf32:
        case Utf32be:
        case Utf32le:
        case Ucs2:
        case Ucs4:
        case GuessEncoding:
            throw Bu::ExceptionBase("Not implemented.");
            break;
    }
    return -1;
}

int Bu::UtfString::writePoint( Bu::Stream &sOut, const Bu::UtfChar &c,
        Bu::UtfString::Encoding sEnc )
{
    switch( sEnc )
    {
        case Utf8:
            {
                uint8_t uByte;
                if( c >= 0x010000 )
                {
                    // Four bytes
                    // 111 111111 111111 111111
                    uByte = (c>>18)|0xF0;
                    sOut.write( &uByte, 1 );
                    uByte = ((c>>12)&0x3F)|0x80;
                    sOut.write( &uByte, 1 );
                    uByte = ((c>>6)&0x3F)|0x80;
                    sOut.write( &uByte, 1 );
                    uByte = (c&0x3F)|0x80;
                    sOut.write( &uByte, 1 );
                    return 4;
                }
                else if( c >= 0x800 )
                {
                    // Three bytes
                    // 1111 111111 111111
                    uByte = (c>>12)|0xE0;
                    sOut.write( &uByte, 1 );
                    uByte = ((c>>6)&0x3F)|0x80;
                    sOut.write( &uByte, 1 );
                    uByte = (c&0x3F)|0x80;
                    sOut.write( &uByte, 1 );
                    return 3;
                }
                else if( c >= 0x80 )
                {
                    // Two bytes
                    // 11111 111111
                    uByte = (c>>6)|0xC0;
                    sOut.write( &uByte, 1 );
                    uByte = (c&0x3F)|0x80;
                    sOut.write( &uByte, 1 );
                    return 2;
                }
                else
                {
                    // One byte
                    uByte = c;
                    sOut.write( &uByte, 1 );
                    return 1;
                }
            }
            break;

        case Utf16:
        case Utf16be:
        case Utf16le:
        case Utf32:
        case Utf32be:
        case Utf32le:
        case Ucs2:
        case Ucs4:
        case GuessEncoding:
            throw Bu::ExceptionBase("Not implemented.");
            break;
    }
    return -1;
}

int32_t Bu::UtfString::toInt32( int iRadix ) const
{
    return strtol( get().getData(), NULL, iRadix );
}

int64_t Bu::UtfString::toInt64( int iRadix ) const
{
    return strtoll( get().getData(), NULL, iRadix );
}

void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const
{
    int iPos = 0;
    while( iPos < aData.getSize() )
    {
        uint8_t uByte;
        Bu::UtfChar chr = nextChar( iPos );
        if( chr >= 0x010000 )
        {
            // Four bytes
            // 111 111111 111111 111111
            uByte = (chr>>18)|0xF0;
            sOut.write( &uByte, 1 );
            uByte = ((chr>>12)&0x3F)|0x80;
            sOut.write( &uByte, 1 );
            uByte = ((chr>>6)&0x3F)|0x80;
            sOut.write( &uByte, 1 );
            uByte = (chr&0x3F)|0x80;
            sOut.write( &uByte, 1 );
        }
        else if( chr >= 0x800 )
        {
            // Three bytes
            // 1111 111111 111111
            uByte = (chr>>12)|0xE0;
            sOut.write( &uByte, 1 );
            uByte = ((chr>>6)&0x3F)|0x80;
            sOut.write( &uByte, 1 );
            uByte = (chr&0x3F)|0x80;
            sOut.write( &uByte, 1 );
        }
        else if( chr >= 0x80 )
        {
            // Two bytes
            // 11111 111111
            uByte = (chr>>6)|0xC0;
            sOut.write( &uByte, 1 );
            uByte = (chr&0x3F)|0x80;
            sOut.write( &uByte, 1 );
        }
        else
        {
            // One byte
            uByte = chr;
            sOut.write( &uByte, 1 );
        }
    }
}
/*
void Bu::UtfString::writeUtf16( Bu::Stream &sOut )
{
}
*/
void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) const
{
#if BYTE_ORDER == BIG_ENDIAN
    uint16_t iTmp = 0xFEFF; // Byte Order Marker
    sOut.write( &iTmp, 2 );
    for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ )
    {
        iTmp = *i;
        sOut.write( &iTmp, 2 );
    }
#else
    uint16_t iTmp = 0xFEFF; // Byte Order Marker
    iTmp = (iTmp>>8) | (iTmp<<8);
    sOut.write( &iTmp, 2 );
    for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ )
    {
        iTmp = *i;
        iTmp = (iTmp>>8) | (iTmp<<8);
        sOut.write( &iTmp, 2 );
    }
#endif
}

void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) const
{
#if BYTE_ORDER == LITTLE_ENDIAN
    uint16_t iTmp = 0xFEFF; // Byte Order Marker
    sOut.write( &iTmp, 2 );
    for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ )
    {
        iTmp = *i;
        sOut.write( &iTmp, 2 );
    }
#else
    uint16_t iTmp = 0xFEFF; // Byte Order Marker
    iTmp = (iTmp>>8) | (iTmp<<8);
    sOut.write( &iTmp, 2 );
    for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ )
    {
        iTmp = *i;
        iTmp = (iTmp>>8) | (iTmp<<8);
        sOut.write( &iTmp, 2 );
    }
#endif
}

void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) const
{
#if BYTE_ORDER == BIG_ENDIAN
    uint32_t iTmp = 0xFEFF; // Byte Order Marker
    sOut.write( &iTmp, 4 );
    int i = 0;
    while( i < aData.getSize() )
    {
        iTmp = nextChar( i );
        sOut.write( &iTmp, 4 );
    }
#else
    uint32_t iTmp = 0xFEFF; // Byte Order Marker
    iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
    sOut.write( &iTmp, 4 );
    int i = 0;
    while( i < aData.getSize() )
    {
        iTmp = nextChar( i );
        iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
        sOut.write( &iTmp, 4 );
    }
#endif
}

void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) const
{
#if BYTE_ORDER == LITTLE_ENDIAN
    uint32_t iTmp = 0xFEFF; // Byte Order Marker
    sOut.write( &iTmp, 4 );
    int i = 0;
    while( i < aData.getSize() )
    {
        iTmp = nextChar( i );
        sOut.write( &iTmp, 4 );
    }
#else
    uint32_t iTmp = 0xFEFF; // Byte Order Marker
    iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
    sOut.write( &iTmp, 4 );
    int i = 0;
    while( i < aData.getSize() )
    {
        iTmp = nextChar( i );
        iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
        sOut.write( &iTmp, 4 );
    }
#endif
}

Bu::UtfChar Bu::UtfString::get( int iIndex ) const
{
    return nextChar( iIndex );
}

Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) const
{
    Bu::UtfChar i = aData[iIndex++];
    switch( i&0xFC00 )
    {
        case 0xD800:
            return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000;

        case 0xDC00:
            return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000;

        default:
            return i;
    }
}

bool Bu::UtfString::operator==( const Bu::UtfString &rhs ) const
{
    return aData == rhs.aData;
}

bool Bu::UtfString::operator==( const Bu::Blob &rhs ) const
{
    // Nieve comparison
    if( aData.getSize() != rhs.getSize() )
        return  false;

    for( int j = 0; j < aData.getSize(); j++ )
    {
        if( aData[j] != rhs[j] )
            return false;
    }

    return true;
}

bool Bu::UtfString::operator==( const char *rhs ) const
{
    // Nieve comparison
    int j;
    for( j = 0; j < aData.getSize(); j++ )
    {
        if( rhs[j] == '\0' || aData[j] != rhs[j] )
            return false;
    }
    if( rhs[j] != '\0' )
        return false;

    return true;
}

Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs )
{
    append( rhs );
    return *this;
}

Bu::UtfString &Bu::UtfString::operator+=( const UtfChar &rhs )
{
    append( rhs );
    return *this;
}

bool Bu::UtfString::operator<( const Bu::UtfString &rhs ) const
{
    for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ )
    {
        if( aData[j] != rhs.aData[j] )
            return aData[j] < rhs.aData[j];
    }

    return false;
}

bool Bu::UtfString::operator<=( const Bu::UtfString &rhs ) const
{
    for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ )
    {
        if( aData[j] != rhs.aData[j] )
            return aData[j] < rhs.aData[j];
    }

    if( aData.getSize() == rhs.aData.getSize() )
        return true;

    return false;
}

bool Bu::UtfString::operator>( const Bu::UtfString &rhs ) const
{
    for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ )
    {
        if( aData[j] != rhs.aData[j] )
            return aData[j] > rhs.aData[j];
    }

    return false;
}

bool Bu::UtfString::operator>=( const Bu::UtfString &rhs ) const
{
    for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ )
    {
        if( aData[j] != rhs.aData[j] )
            return aData[j] > rhs.aData[j];
    }

    if( aData.getSize() == rhs.aData.getSize() )
        return true;

    return false;
}

Bu::String Bu::UtfString::get( Encoding eEnc ) const
{
    Bu::MemBuf mb;
    write( mb, eEnc );
    //return Bu::Blob( mb.getString().getStr(), mb.getString().getSize() );
    return Bu::String( mb.getString().getStr(), mb.getString().getSize() );
}

void Bu::UtfString::debug() const
{
    sio << "Raw Utf16: ";
    for( int i = 0; i < aData.getSize(); i++ )
    {
        if( i > 0 )
            sio << ", ";
        sio << "0x" << Fmt::hex() << aData[i];
    }
    sio << sio.nl;
    sio << "Code Points: ";
    for( int i = 0; i < aData.getSize(); i++ )
    {
        if( i > 0 )
            sio << ", ";
        sio << "0x" << Fmt::hex() << nextChar( i );
    }
    sio << sio.nl;
}
/*
void Bu::UtfString::debugUtf8( const Bu::Blob &sUtf8 )
{
    for( Bu::Blob::const_iterator i = sUtf8.begin(); i; i++ )
    {
        if( i != sUtf8.begin() )
            sio << ", ";
        if( ((int)(uint8_t)*i)&0x80 )
        {
//          sio << "Flag byte: " << Bu::Fmt().radix(2).width(8).fill('0')
//              << (int)(uint8_t)*i << sio.nl;
            int iBytes = 1;
            for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
//          sio << "iBytes = " << iBytes << sio.nl;
            Bu::UtfChar uPt = ((*i) & utf8_lmask[7-iBytes])<<(6*(iBytes-1));
//          sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
//              << (int)utf8_lmask[7-iBytes] << sio.nl;
            for( iBytes--; iBytes >= 1; iBytes-- )
            {
//              sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1))
//                  << sio.nl;
//              sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0')
//                  << (int)(uint8_t)*i << sio.nl
//                  << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
//                  << (int)utf8_lmask[6] << sio.nl;
                i++;
                uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1));
            }
            sio << uPt;
//          sio << " (" << Bu::Fmt( 8, 2 ).fill('0')
//              << uPt << ")";
        }
        else
        {
            sio << (int)((uint8_t)*i);
        }
    }
    sio << sio.nl;
}
*/

template<> uint32_t Bu::__calcHashCode<Bu::UtfString>( const Bu::UtfString &k )
{
    uint32_t uCode = 0;

    for( Bu::UtfString::const_iterator i = k.begin(); i; i++ )
    {
        uCode = *i + (uCode<<6) + (uCode<<16) - uCode;
    }

    return uCode;
}

template<> bool Bu::__cmpHashKeys<Bu::UtfString>(
        const Bu::UtfString &a, const Bu::UtfString &b )
{
    return a == b;
}

Bu::Formatter Bu::operator<<( Bu::Formatter &f, const Bu::UtfString &s )
{
    return f << s.get();
}