This repository has been archived on 2020-08-02. You can view files and clone it, but cannot push or open issues/pull-requests.
uniborg/stdplugins/tl.py

366 lines
10 KiB
Python

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
Translates stuff into English
"""
import aiohttp
import asyncio
import io
import math
import mimetypes
import re
import time
from telethon import helpers, types
# Register a private file extension so that names ending in '.borg+tts'
# (the name given to the generated audio in the `tts` handler) are guessed
# as MP3 audio.
mimetypes.add_type('audio/mpeg', '.borg+tts')

# Language code -> English display name.
# NOTE(review): presumably these are the codes accepted for the `source` and
# `target` arguments of `Translator`; nothing visible in this file reads the
# table, so confirm against callers before relying on it.
LANGUAGES = {
    'af': 'Afrikaans',
    'sq': 'Albanian',
    'am': 'Amharic',
    'ar': 'Arabic',
    'hy': 'Armenian',
    'az': 'Azerbaijani',
    'eu': 'Basque',
    'be': 'Belarusian',
    'bn': 'Bengali',
    'bs': 'Bosnian',
    'bg': 'Bulgarian',
    'ca': 'Catalan',
    'ceb': 'Cebuano',
    'ny': 'Chichewa',
    'zh-CN': 'Chinese (Simplified)',
    'zh-TW': 'Chinese (Traditional)',
    'co': 'Corsican',
    'hr': 'Croatian',
    'cs': 'Czech',
    'da': 'Danish',
    'nl': 'Dutch',
    'en': 'English',
    'eo': 'Esperanto',
    'et': 'Estonian',
    'tl': 'Filipino',
    'fi': 'Finnish',
    'fr': 'French',
    'fy': 'Frisian',
    'gl': 'Galician',
    'ka': 'Georgian',
    'de': 'German',
    'el': 'Greek',
    'gu': 'Gujarati',
    'ht': 'Haitian Creole',
    'ha': 'Hausa',
    'haw': 'Hawaiian',
    'iw': 'Hebrew',
    'hi': 'Hindi',
    'hmn': 'Hmong',
    'hu': 'Hungarian',
    'is': 'Icelandic',
    'ig': 'Igbo',
    'id': 'Indonesian',
    'ga': 'Irish',
    'it': 'Italian',
    'ja': 'Japanese',
    'jw': 'Javanese',
    'kn': 'Kannada',
    'kk': 'Kazakh',
    'km': 'Khmer',
    'rw': 'Kinyarwanda',
    'ko': 'Korean',
    'ku': 'Kurdish (Kurmanji)',
    'ky': 'Kyrgyz',
    'lo': 'Lao',
    'la': 'Latin',
    'lv': 'Latvian',
    'lt': 'Lithuanian',
    'lb': 'Luxembourgish',
    'mk': 'Macedonian',
    'mg': 'Malagasy',
    'ms': 'Malay',
    'ml': 'Malayalam',
    'mt': 'Maltese',
    'mi': 'Maori',
    'mr': 'Marathi',
    'mn': 'Mongolian',
    'my': 'Myanmar (Burmese)',
    'ne': 'Nepali',
    'no': 'Norwegian',
    'or': 'Odia (Oriya)',
    'ps': 'Pashto',
    'fa': 'Persian',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'pa': 'Punjabi',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sm': 'Samoan',
    'gd': 'Scots Gaelic',
    'sr': 'Serbian',
    'st': 'Sesotho',
    'sn': 'Shona',
    'sd': 'Sindhi',
    'si': 'Sinhala',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'so': 'Somali',
    'es': 'Spanish',
    'su': 'Sundanese',
    'sw': 'Swahili',
    'sv': 'Swedish',
    'tg': 'Tajik',
    'ta': 'Tamil',
    'tt': 'Tatar',
    'te': 'Telugu',
    'th': 'Thai',
    'tr': 'Turkish',
    'tk': 'Turkmen',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'ug': 'Uyghur',
    'uz': 'Uzbek',
    'vi': 'Vietnamese',
    'cy': 'Welsh',
    'xh': 'Xhosa',
    'yi': 'Yiddish',
    'yo': 'Yoruba',
    'zu': 'Zulu',
}
def split_text(text, n=40):
    """
    Split ``text`` into chunks of at most ``n`` whitespace-separated words.

    Each chunk is cut at the most natural boundary found within its first
    ``n`` words, in this order of preference: the last word ending in a
    period, otherwise the last word ending in a semicolon, otherwise the
    last word ending in a comma, otherwise a hard cut after ``n`` words.
    Words are re-joined with single spaces, so the original whitespace is
    not preserved.

    This is a generator; an empty or whitespace-only ``text`` yields nothing.

    :param text: the text to split.
    :param n: maximum number of words per chunk; must be at least 1.
    :raises ValueError: if ``n`` is smaller than 1 (raised when iteration
        starts). Without this check such values would never consume any
        word and the generator would yield empty strings forever.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    words = text.split()
    while len(words) > n:
        comma = None
        semicolon = None
        # Scan backwards so the boundary closest to the limit is found first.
        for i in reversed(range(n)):
            word = words[i]
            if word.endswith('.'):
                cut = i + 1
                break
            if semicolon is None and word.endswith(';'):
                semicolon = i + 1
            elif comma is None and word.endswith(','):
                comma = i + 1
        else:
            # No sentence end within the window: fall back to weaker breaks.
            cut = semicolon or comma or n

        yield ' '.join(words[:cut])
        words = words[cut:]

    if words:
        yield ' '.join(words)
class Translator:
    """
    Small asynchronous client for the web endpoints of Google Translate.

    It requests translations (`translate`) and synthesized speech (`tts`)
    through one shared aiohttp session. Every request is signed with a
    token computed from the ``tkk`` seed scraped from the translate page.
    """
    # Captures the two integer halves of the seed embedded in the page HTML.
    _TKK_RE = re.compile(r"tkk:'(\d+)\.(\d+)'", re.DOTALL)
    _BASE_URL = 'https://translate.google.com'
    _TRANSLATE_URL = 'https://translate.google.com/translate_a/single'
    _TRANSLATE_TTS_URL = 'https://translate.google.com/translate_tts'
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'
    }

    def __init__(self, target='en', source='auto'):
        """
        :param target: default language code to translate (or speak) into.
        :param source: default source language code.
        """
        self._target = target
        self._source = source
        self._session = aiohttp.ClientSession(headers=self._HEADERS)
        # Cached pair of integers parsed from the page, or None until the
        # first fetch. The first integer is compared with the current hour
        # number to decide when the seed is stale.
        self._tkk = None
        # Serializes seed refreshes between concurrent requests.
        self._tkk_lock = asyncio.Lock()

    async def _fetch_tkk(self):
        """
        Download the translate page and return its seed as a tuple of ints.

        :raises RuntimeError: if the seed cannot be found in the page.
        """
        async with self._session.get(self._BASE_URL) as resp:
            html = await resp.text()

        match = self._TKK_RE.search(html)
        if match is None:
            # Fail with an explicit message instead of an AttributeError on
            # None when the page layout no longer matches the pattern.
            raise RuntimeError('could not find the tkk seed in the translate page')
        return tuple(map(int, match.groups()))

    def _need_refresh_tkk(self):
        """Return True if no seed is cached or it is not from the current hour."""
        return (self._tkk is None) or (self._tkk[0] != int(time.time() / 3600))

    async def _ensure_tkk(self):
        """Refresh the cached seed if it is missing or stale."""
        if not self._need_refresh_tkk():
            return
        async with self._tkk_lock:
            # Check again once the lock is held: another task may have
            # refreshed the seed while this one was waiting, in which case
            # fetching the page a second time would be wasted work.
            if self._need_refresh_tkk():
                self._tkk = await self._fetch_tkk()

    def _calc_token(self, text):
        """
        Original code by ultrafunkamsterdam/googletranslate:
        https://github.com/ultrafunkamsterdam/googletranslate/blob/bd3f4d0a1386ffa634c8ebbebb3603279f3ece99/googletranslate/__init__.py#L263

        If this ever breaks, the way it was found was in one of the top-100
        longest lines of `translate_m.js` used by translate.google.com, it
        uses a single-line with all these "magic" values and one can look
        around there and use a debugger to figure out how it works. It's
        a very straight-forward port.

        Requires `self._tkk` to be set. Returns the value for the ``tk``
        request parameter as a string of the form ``'<int>.<int>'``.
        """
        def xor_rot(a, b):
            # `b` is a sequence of 3-character instructions: the first
            # character selects masked 32-bit addition ('+') or XOR, the
            # second selects a right shift ('+') or a left shift, and the
            # third is the shift amount (a decimal digit, or a lowercase
            # letter where 'a' stands for 10).
            size_b = len(b)
            c = 0
            while c < size_b - 2:
                d = b[c + 2]
                d = ord(d[0]) - 87 if 'a' <= d else int(d)
                d = (a % 0x100000000) >> d if '+' == b[c + 1] else a << d
                a = a + d & 4294967295 if '+' == b[c] else a ^ d
                c += 3
            return a

        # Collect the text as a list of code units.
        a = []
        text = helpers.add_surrogate(text)
        for i in text:
            val = ord(i)
            if val < 0x10000:
                a += [val]
            else:
                a += [
                    math.floor((val - 0x10000) / 0x400 + 0xD800),
                    math.floor((val - 0x10000) % 0x400 + 0xDC00),
                ]

        d = self._tkk
        b = d[0]

        # Turn the code units into a list of byte values (1 to 4 bytes per
        # character), merging a high/low surrogate pair into one character.
        e = []
        g = 0
        size = len(text)
        while g < size:
            l = a[g]
            if l < 128:
                e.append(l)
            else:
                if l < 2048:
                    e.append(l >> 6 | 192)
                else:
                    if (
                        (l & 64512) == 55296
                        and g + 1 < size
                        and a[g + 1] & 64512 == 56320
                    ):
                        g += 1
                        l = 65536 + ((l & 1023) << 10) + (a[g] & 1023)
                        e.append(l >> 18 | 240)
                        e.append(l >> 12 & 63 | 128)
                    else:
                        e.append(l >> 12 | 224)
                    e.append(l >> 6 & 63 | 128)
                e.append(l & 63 | 128)
            g += 1

        # Fold every byte into the running value, then finalize it.
        a = b
        for value in e:
            a += value
            a = xor_rot(a, '+-a^+6')
        a = xor_rot(a, '+-3^+b+-f')
        a ^= d[1]
        if a < 0:
            a = (a & 2147483647) + 2147483648
        a %= 1000000
        return '{}.{}'.format(a, a ^ b)

    async def translate(self, text, target=None, source=None):
        """
        Translate ``text`` and return the translation as a single string.

        :param text: the text to translate.
        :param target: target language code; defaults to the instance's.
        :param source: source language code; defaults to the instance's.
        """
        await self._ensure_tkk()

        params = [
            ('client', 'webapp'),
            ('sl', source or self._source),
            ('tl', target or self._target),
            ('hl', 'en'),
            *[('dt', x) for x in ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'sos', 'ss', 't']],
            ('ie', 'UTF-8'),
            ('oe', 'UTF-8'),
            ('otf', 1),
            ('ssel', 0),
            ('tsel', 0),
            ('tk', self._calc_token(text)),
            ('q', text),
        ]
        async with self._session.get(self._TRANSLATE_URL, params=params) as resp:
            data = await resp.json()

        # The first element of the response holds the translated parts; the
        # first item of each part is its text (skipped when None).
        return ''.join(part[0] for part in data[0] if part[0] is not None)

    async def tts(self, text, target=None):
        """
        Synthesize ``text`` as speech and return the audio as bytes.

        The text is split with `split_text` and requested piece by piece;
        the responses are concatenated in order.

        :param text: the text to speak.
        :param target: language code of the voice; defaults to the
            instance's target language.
        :raises ValueError: if the service answers 404 for a piece.
        """
        await self._ensure_tkk()

        parts = list(split_text(text))
        chunks = []
        for i, part in enumerate(parts):
            params = [
                ('ie', 'UTF-8'),
                ('q', part),
                ('tl', target or self._target),
                ('total', len(parts)),
                ('idx', i),
                ('textlen', len(helpers.add_surrogate(part))),
                ('tk', self._calc_token(part)),
                ('client', 'webapp'),
                ('prev', 'input'),
            ]
            async with self._session.get(self._TRANSLATE_TTS_URL, params=params) as resp:
                if resp.status == 404:
                    raise ValueError('unknown target language')
                chunks.append(await resp.read())

        # Join once at the end instead of growing a bytes object per piece.
        return b''.join(chunks)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._session.close()
# Module-wide Translator instance used by the `tl` and `tts` command
# handlers; its HTTP session is closed in `unload()`.
translator = Translator()
@borg.on(borg.cmd(r"tl"))
async def _(event):
    """
    Handle the `tl` command: translate text with the shared translator.

    When the command is a reply, the replied-to message is translated.
    Otherwise, when not running as a bot, the chat history is walked and
    the first run of messages not sent by `borg.uid` is collected, stopping
    at the next message sent by `borg.uid`. A bot that is not replied to
    does nothing.

    The result is written by editing the command message, or by sending a
    new message when running as a bot. If there is no text to translate
    the handler returns without doing anything, like the `tts` handler.
    """
    if event.is_reply:
        reply = await event.get_reply_message()
        # The replied-to message may be unavailable or may carry no text.
        text = (reply.raw_text if reply else '') or ''
    elif not borg.me.bot:
        text = ''
        started = False
        # NOTE(review): each message is prepended, so this presumably relies
        # on the history being iterated newest-first — confirm.
        async for m in borg.iter_messages(event.chat_id):
            if started and m.sender_id == borg.uid:
                break
            if m.sender_id != borg.uid:
                started = True
            if not started or not m.raw_text:
                continue

            # Messages containing a space go on their own line; single
            # words are joined with a space.
            if ' ' in m.raw_text:
                text = m.raw_text + '\n' + text
            else:
                text = m.raw_text + ' ' + text
    else:
        return

    text = text.strip()
    if not text:
        # Nothing to translate; avoid sending an empty query.
        return

    translated = await translator.translate(text)
    action = event.edit if not borg.me.bot else event.respond
    await action('translation: ' + translated, parse_mode=None)
@borg.on(borg.cmd(r"tts"))
async def _(event):
    """
    Handle the `tts` command: send the given text as a voice message.

    The text is whatever follows the command word; if there is none and
    the command is a reply, the replied-to message's text is used. When
    not running as a bot the command message is deleted first. Without
    any text the handler returns without sending anything.
    """
    if not borg.me.bot:
        await event.delete()

    pieces = event.raw_text.split(maxsplit=1)
    text = pieces[1] if len(pieces) >= 2 else None
    if not text and event.is_reply:
        replied = await event.get_reply_message()
        text = replied.raw_text
    if not text:
        return

    audio = io.BytesIO(await translator.tts(text))
    # The '.borg+tts' extension is registered as 'audio/mpeg' at import time.
    audio.name = 'a.borg+tts'

    # Only reply to the original target when not running as a bot.
    reply_to = None if borg.me.bot else event.reply_to_msg_id
    voice_note = types.DocumentAttributeAudio(duration=0, voice=True)
    await borg.send_file(
        event.chat_id,
        audio,
        reply_to=reply_to,
        attributes=[voice_note]
    )
async def unload():
    """Release the module's resources by closing the shared translator."""
    await translator.close()