# uniborg/stdplugins/tl.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
Translates stuff into English
"""
import aiohttp
import asyncio
import io
import math
import mimetypes
import re
import time

from telethon import helpers, types


# Register a custom '.borg+tts' extension as MP3 audio. The tts handler below
# names its in-memory file 'a.borg+tts' -- presumably so the upload is
# detected as audio/mpeg (TODO confirm against the upload code path).
mimetypes.add_type('audio/mpeg', '.borg+tts')


# Mapping of language code -> English display name.
# NOTE(review): not referenced anywhere else in the visible part of this file;
# presumably the language codes accepted by Google Translate -- confirm.
LANGUAGES = {
    'af': 'Afrikaans',
    'sq': 'Albanian',
    'am': 'Amharic',
    'ar': 'Arabic',
    'hy': 'Armenian',
    'az': 'Azerbaijani',
    'eu': 'Basque',
    'be': 'Belarusian',
    'bn': 'Bengali',
    'bs': 'Bosnian',
    'bg': 'Bulgarian',
    'ca': 'Catalan',
    'ceb': 'Cebuano',
    'ny': 'Chichewa',
    'zh-CN': 'Chinese (Simplified)',
    'zh-TW': 'Chinese (Traditional)',
    'co': 'Corsican',
    'hr': 'Croatian',
    'cs': 'Czech',
    'da': 'Danish',
    'nl': 'Dutch',
    'en': 'English',
    'eo': 'Esperanto',
    'et': 'Estonian',
    'tl': 'Filipino',
    'fi': 'Finnish',
    'fr': 'French',
    'fy': 'Frisian',
    'gl': 'Galician',
    'ka': 'Georgian',
    'de': 'German',
    'el': 'Greek',
    'gu': 'Gujarati',
    'ht': 'Haitian Creole',
    'ha': 'Hausa',
    'haw': 'Hawaiian',
    'iw': 'Hebrew',
    'hi': 'Hindi',
    'hmn': 'Hmong',
    'hu': 'Hungarian',
    'is': 'Icelandic',
    'ig': 'Igbo',
    'id': 'Indonesian',
    'ga': 'Irish',
    'it': 'Italian',
    'ja': 'Japanese',
    'jw': 'Javanese',
    'kn': 'Kannada',
    'kk': 'Kazakh',
    'km': 'Khmer',
    'rw': 'Kinyarwanda',
    'ko': 'Korean',
    'ku': 'Kurdish (Kurmanji)',
    'ky': 'Kyrgyz',
    'lo': 'Lao',
    'la': 'Latin',
    'lv': 'Latvian',
    'lt': 'Lithuanian',
    'lb': 'Luxembourgish',
    'mk': 'Macedonian',
    'mg': 'Malagasy',
    'ms': 'Malay',
    'ml': 'Malayalam',
    'mt': 'Maltese',
    'mi': 'Maori',
    'mr': 'Marathi',
    'mn': 'Mongolian',
    'my': 'Myanmar (Burmese)',
    'ne': 'Nepali',
    'no': 'Norwegian',
    'or': 'Odia (Oriya)',
    'ps': 'Pashto',
    'fa': 'Persian',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'pa': 'Punjabi',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sm': 'Samoan',
    'gd': 'Scots Gaelic',
    'sr': 'Serbian',
    'st': 'Sesotho',
    'sn': 'Shona',
    'sd': 'Sindhi',
    'si': 'Sinhala',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'so': 'Somali',
    'es': 'Spanish',
    'su': 'Sundanese',
    'sw': 'Swahili',
    'sv': 'Swedish',
    'tg': 'Tajik',
    'ta': 'Tamil',
    'tt': 'Tatar',
    'te': 'Telugu',
    'th': 'Thai',
    'tr': 'Turkish',
    'tk': 'Turkmen',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'ug': 'Uyghur',
    'uz': 'Uzbek',
    'vi': 'Vietnamese',
    'cy': 'Welsh',
    'xh': 'Xhosa',
    'yi': 'Yiddish',
    'yo': 'Yoruba',
    'zu': 'Zulu'
}


def split_text(text, n=40):
    """Yield *text* in chunks of at most *n* whitespace-separated words.

    Each chunk is cut, in order of preference, after the last word within
    the first *n* words that ends with a period, else a semicolon, else a
    comma; if none of those is present the chunk is exactly *n* words.
    Words are re-joined with single spaces, so the original whitespace
    (including newlines) is not preserved.

    Args:
        text: The string to split.
        n: Maximum number of words per chunk; must be at least 1.

    Yields:
        Non-empty chunk strings. Nothing is yielded for blank input.

    Raises:
        ValueError: If *n* is smaller than 1. Because this is a generator
            the error surfaces on the first iteration, not at call time.
    """
    if n < 1:
        # With n < 1 the slice below never shrinks `words`, so the loop
        # would otherwise yield forever.
        raise ValueError('n must be a positive integer')

    words = text.split()
    while len(words) > n:
        semicolon = comma = None
        # Scan backwards so the first hit of each kind is the latest one.
        for i in reversed(range(n)):
            word = words[i]
            if word.endswith('.'):
                cut = i + 1
                break
            if semicolon is None and word.endswith(';'):
                semicolon = i + 1
            elif comma is None and word.endswith(','):
                comma = i + 1
        else:
            # No sentence end found: fall back to weaker separators, and
            # finally to a hard cut at n words.
            cut = semicolon or comma or n
        yield ' '.join(words[:cut])
        words = words[cut:]
    if words:
        yield ' '.join(words)


class Translator:
    """Small async client for the translate.google.com web endpoints.

    Requests are signed with a ``tk`` token derived from a "TKK" seed that
    is scraped from the translate home page. The seed is cached on the
    instance and re-fetched when its first component no longer equals the
    current hour count since the epoch.

    Call :meth:`close` when done to release the underlying HTTP session.
    """

    # Extracts the two integers of the "tkk:'<a>.<b>'" seed from the page.
    _TKK_RE = re.compile(r"tkk:'(\d+)\.(\d+)'", re.DOTALL)
    _BASE_URL = 'https://translate.google.com'
    _TRANSLATE_URL = 'https://translate.google.com/translate_a/single'
    _TRANSLATE_TTS_URL = 'https://translate.google.com/translate_tts'
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'
    }

    def __init__(self, target='en', source='auto'):
        """
        Args:
            target: Default target language code for translate() and tts().
            source: Default source language code for translate().
        """
        self._target = target
        self._source = source
        self._session = aiohttp.ClientSession(headers=self._HEADERS)
        # Cached (a, b) seed tuple, or None until the first request.
        self._tkk = None
        self._tkk_lock = asyncio.Lock()

    async def _fetch_tkk(self):
        """Download the translate home page and return its TKK seed.

        Returns:
            A 2-tuple of ints parsed from the ``tkk:'a.b'`` snippet.

        Raises:
            RuntimeError: If the page does not contain the expected snippet.
        """
        async with self._session.get(self._BASE_URL) as resp:
            html = await resp.text()
        match = self._TKK_RE.search(html)
        if match is None:
            raise RuntimeError('could not find the TKK seed in the translate page')
        return tuple(map(int, match.groups()))

    def _need_refresh_tkk(self):
        """Return True when no seed is cached or its hour stamp is stale."""
        return (self._tkk is None) or (self._tkk[0] != int(time.time() / 3600))

    async def _ensure_tkk(self):
        """Refresh the cached TKK seed if needed, fetching at most once.

        The staleness check is repeated after acquiring the lock so that
        callers which were waiting on the lock reuse the seed fetched by
        the first one instead of each downloading the page again.
        """
        if not self._need_refresh_tkk():
            return
        async with self._tkk_lock:
            if self._need_refresh_tkk():
                self._tkk = await self._fetch_tkk()

    def _calc_token(self, text):
        """
        Compute the ``tk`` request token for *text* from the cached TKK seed.

        Requires ``self._tkk`` to be set. Returns a string of the form
        ``'<int>.<int>'``.

        Original code by ultrafunkamsterdam/googletranslate:
        https://github.com/ultrafunkamsterdam/googletranslate/blob/bd3f4d0a1386ffa634c8ebbebb3603279f3ece99/googletranslate/__init__.py#L263

        If this ever breaks, the way it was found was in one of the top-100
        longest lines of `translate_m.js` used by translate.google.com, it
        uses a single-line with all these "magic" values and one can look
        around there and use a debugger to figure out how it works. It's
        a very straight-forward port.
        """
        def xor_rot(a, b):
            # `b` is read three characters at a time:
            #   b[c]     '+' -> add (masked to 32 bits), otherwise xor
            #   b[c + 1] '+' -> shift right, otherwise shift left
            #   b[c + 2] shift amount: a digit, or a letter where 'a' == 10
            size_b = len(b)
            c = 0
            while c < size_b - 2:
                d = b[c + 2]
                d = ord(d[0]) - 87 if 'a' <= d else int(d)
                d = (a % 0x100000000) >> d if '+' == b[c + 1] else a << d
                a = a + d & 4294967295 if '+' == b[c] else a ^ d
                c += 3
            return a

        # Turn the text into a list of 16-bit code units.
        a = []
        text = helpers.add_surrogate(text)
        for i in text:
            val = ord(i)
            if val < 0x10000:
                a += [val]
            else:
                a += [
                    math.floor((val - 0x10000) / 0x400 + 0xD800),
                    math.floor((val - 0x10000) % 0x400 + 0xDC00),
                ]

        d = self._tkk
        b = d[0]
        # Hand-rolled UTF-8 encoding of the code units into byte values.
        e = []
        g = 0
        size = len(text)
        while g < size:
            l = a[g]
            if l < 128:
                e.append(l)
            else:
                if l < 2048:
                    e.append(l >> 6 | 192)
                else:
                    if (
                            (l & 64512) == 55296
                            and g + 1 < size
                            and a[g + 1] & 64512 == 56320
                    ):
                        # High + low surrogate: combine into one code point
                        # and emit the first two bytes of a 4-byte sequence.
                        g += 1
                        l = 65536 + ((l & 1023) << 10) + (a[g] & 1023)
                        e.append(l >> 18 | 240)
                        e.append(l >> 12 & 63 | 128)
                    else:
                        e.append(l >> 12 | 224)
                    e.append(l >> 6 & 63 | 128)
                e.append(l & 63 | 128)
            g += 1

        # Fold every byte into the hash, seeded with the first TKK component.
        a = b
        for value in e:
            a += value
            a = xor_rot(a, '+-a^+6')
        a = xor_rot(a, '+-3^+b+-f')
        a ^= d[1]
        if a < 0:
            a = (a & 2147483647) + 2147483648
        a %= 1000000
        return '{}.{}'.format(a, a ^ b)

    async def translate(self, text, target=None, source=None):
        """Translate *text* and return the translated string.

        Args:
            text: The text to translate.
            target: Target language code; defaults to the instance target.
            source: Source language code; defaults to the instance source.

        Returns:
            The concatenation of the first element of every entry in the
            first item of the JSON response, skipping ``None`` elements.
        """
        await self._ensure_tkk()

        params = [
            ('client', 'webapp'),
            ('sl', source or self._source),
            ('tl', target or self._target),
            ('hl', 'en'),
            *[('dt', x) for x in ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'sos', 'ss', 't']],
            ('ie', 'UTF-8'),
            ('oe', 'UTF-8'),
            ('otf', 1),
            ('ssel', 0),
            ('tsel', 0),
            ('tk', self._calc_token(text)),
            ('q', text),
        ]

        async with self._session.get(self._TRANSLATE_URL, params=params) as resp:
            data = await resp.json()
            return ''.join(part[0] for part in data[0] if part[0] is not None)

    async def tts(self, text, target=None):
        """Synthesize speech for *text* and return the audio as bytes.

        The text is split with ``split_text`` and one request is made per
        chunk; the response bodies are concatenated in order.

        Args:
            text: The text to speak.
            target: Language code of the voice; defaults to the instance
                target.

        Raises:
            ValueError: If the endpoint answers 404 for a chunk.
        """
        await self._ensure_tkk()

        parts = list(split_text(text))
        chunks = []
        for i, part in enumerate(parts):
            params = [
                ('ie', 'UTF-8'),
                ('q', part),
                ('tl', target or self._target),
                ('total', len(parts)),
                ('idx', i),
                ('textlen', len(helpers.add_surrogate(part))),
                ('tk', self._calc_token(part)),
                ('client', 'webapp'),
                ('prev', 'input'),
            ]

            async with self._session.get(self._TRANSLATE_TTS_URL, params=params) as resp:
                if resp.status == 404:
                    raise ValueError('unknown target language')
                # NOTE(review): other non-2xx statuses are not checked, so
                # an error body would be appended as if it were audio.
                chunks.append(await resp.read())

        return b''.join(chunks)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._session.close()


# Shared module-level instance used by both command handlers below; its HTTP
# session is closed in unload().
translator = Translator()


@borg.on(borg.cmd(r"tl"))
async def _(event):
    """Translate a message (or a recent run of messages) with the shared translator.

    * When the command is a reply, the replied-to message's raw text is used.
    * Otherwise, when not running as a bot, the chat history is walked:
      leading messages sent by ourselves are skipped, then every message
      with text is collected until the next message sent by ourselves.
    * When running as a bot without a reply, nothing happens.

    The result is written by editing the command message (user account) or
    by sending a new message (bot). Nothing is sent when there is no text
    to translate.
    """
    if event.is_reply:
        reply = await event.get_reply_message()
        # The replied-to message may no longer be retrievable.
        text = reply.raw_text if reply is not None else ''
    elif not borg.me.bot:
        text = ''
        started = False
        # Each message is prepended, so the final text ends up in the
        # reverse of iteration order (presumably chronological, assuming
        # iter_messages yields newest first -- confirm).
        async for m in borg.iter_messages(event.chat_id):
            if started and m.sender_id == borg.uid:
                break
            if m.sender_id != borg.uid:
                started = True
            if not started or not m.raw_text:
                continue
            # Multi-word messages get their own line; single words are
            # joined onto the following text with a space.
            separator = '\n' if ' ' in m.raw_text else ' '
            text = m.raw_text + separator + text
    else:
        return

    text = (text or '').strip()
    if not text:
        # Same guard as the tts handler: do not query the API with an
        # empty string.
        return

    translated = await translator.translate(text)
    action = event.edit if not borg.me.bot else event.respond
    await action('translation: ' + translated, parse_mode=None)


@borg.on(borg.cmd(r"tts"))
async def _(event):
    """Send a voice message speaking the given text.

    The text is whatever follows the command word; if there is none and the
    command is a reply, the replied-to message's raw text is used instead.
    When not running as a bot the command message itself is deleted first.
    Nothing is sent when no text is available.
    """
    if not borg.me.bot:
        await event.delete()

    # Everything after the first whitespace-separated word, if present.
    tail = event.raw_text.split(maxsplit=1)[1:]
    text = tail[0] if tail else None

    if not text and event.is_reply:
        replied = await event.get_reply_message()
        text = replied.raw_text

    if not text:
        return

    audio = await translator.tts(text)
    file = io.BytesIO(audio)
    # The custom extension is registered as audio/mpeg at module import.
    file.name = 'a.borg+tts'

    reply_to = None if borg.me.bot else event.reply_to_msg_id
    voice_note = types.DocumentAttributeAudio(duration=0, voice=True)
    await borg.send_file(
        event.chat_id,
        file,
        reply_to=reply_to,
        attributes=[voice_note],
    )


async def unload():
    """Close the shared translator (and with it its HTTP session).

    NOTE(review): presumably called by the plugin loader when this module
    is unloaded -- confirm against the loader.
    """
    await translator.close()