366 lines
10 KiB
Python
366 lines
10 KiB
Python
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
"""
|
|
Translates stuff into English
|
|
"""
|
|
import aiohttp
|
|
import asyncio
|
|
import io
|
|
import math
|
|
import mimetypes
|
|
import re
|
|
import time
|
|
|
|
from telethon import helpers, types
|
|
|
|
|
|
# Register a private extension so that in-memory files named '*.borg+tts'
# are reported as MPEG audio by the `mimetypes` module.
mimetypes.add_type('audio/mpeg', '.borg+tts')


# Language code -> English display name.
LANGUAGES = {
    'af': 'Afrikaans',
    'sq': 'Albanian',
    'am': 'Amharic',
    'ar': 'Arabic',
    'hy': 'Armenian',
    'az': 'Azerbaijani',
    'eu': 'Basque',
    'be': 'Belarusian',
    'bn': 'Bengali',
    'bs': 'Bosnian',
    'bg': 'Bulgarian',
    'ca': 'Catalan',
    'ceb': 'Cebuano',
    'ny': 'Chichewa',
    'zh-CN': 'Chinese (Simplified)',
    'zh-TW': 'Chinese (Traditional)',
    'co': 'Corsican',
    'hr': 'Croatian',
    'cs': 'Czech',
    'da': 'Danish',
    'nl': 'Dutch',
    'en': 'English',
    'eo': 'Esperanto',
    'et': 'Estonian',
    'tl': 'Filipino',
    'fi': 'Finnish',
    'fr': 'French',
    'fy': 'Frisian',
    'gl': 'Galician',
    'ka': 'Georgian',
    'de': 'German',
    'el': 'Greek',
    'gu': 'Gujarati',
    'ht': 'Haitian Creole',
    'ha': 'Hausa',
    'haw': 'Hawaiian',
    'iw': 'Hebrew',
    'hi': 'Hindi',
    'hmn': 'Hmong',
    'hu': 'Hungarian',
    'is': 'Icelandic',
    'ig': 'Igbo',
    'id': 'Indonesian',
    'ga': 'Irish',
    'it': 'Italian',
    'ja': 'Japanese',
    'jw': 'Javanese',
    'kn': 'Kannada',
    'kk': 'Kazakh',
    'km': 'Khmer',
    'rw': 'Kinyarwanda',
    'ko': 'Korean',
    'ku': 'Kurdish (Kurmanji)',
    'ky': 'Kyrgyz',
    'lo': 'Lao',
    'la': 'Latin',
    'lv': 'Latvian',
    'lt': 'Lithuanian',
    'lb': 'Luxembourgish',
    'mk': 'Macedonian',
    'mg': 'Malagasy',
    'ms': 'Malay',
    'ml': 'Malayalam',
    'mt': 'Maltese',
    'mi': 'Maori',
    'mr': 'Marathi',
    'mn': 'Mongolian',
    'my': 'Myanmar (Burmese)',
    'ne': 'Nepali',
    'no': 'Norwegian',
    'or': 'Odia (Oriya)',
    'ps': 'Pashto',
    'fa': 'Persian',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'pa': 'Punjabi',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sm': 'Samoan',
    'gd': 'Scots Gaelic',
    'sr': 'Serbian',
    'st': 'Sesotho',
    'sn': 'Shona',
    'sd': 'Sindhi',
    'si': 'Sinhala',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'so': 'Somali',
    'es': 'Spanish',
    'su': 'Sundanese',
    'sw': 'Swahili',
    'sv': 'Swedish',
    'tg': 'Tajik',
    'ta': 'Tamil',
    'tt': 'Tatar',
    'te': 'Telugu',
    'th': 'Thai',
    'tr': 'Turkish',
    'tk': 'Turkmen',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'ug': 'Uyghur',
    'uz': 'Uzbek',
    'vi': 'Vietnamese',
    'cy': 'Welsh',
    'xh': 'Xhosa',
    'yi': 'Yiddish',
    'yo': 'Yoruba',
    'zu': 'Zulu',
}
|
|
|
|
|
|
def split_text(text, n=40):
    """
    Split `text` into chunks of at most `n` whitespace-separated words.

    Within each window of `n` words the cut point is chosen by scanning
    backwards from the end of the window:

    * the last word ending in '.' wins outright;
    * otherwise the last word ending in ';' is used;
    * otherwise the last word ending in ',' is used;
    * otherwise the window is cut after exactly `n` words.

    This is a generator; each yielded chunk is the words joined by a single
    space. Nothing is yielded for text containing no words.

    Raises `ValueError` (on first iteration) if `n` is smaller than 1, since
    such a window can never consume any words and the loop would not end.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    words = text.split()
    while len(words) > n:
        comma = None
        semicolon = None
        for i in reversed(range(n)):
            word = words[i]
            if word.endswith('.'):
                # A sentence end is the preferred place to cut.
                cut = i + 1
                break
            # Only the first hit of the backwards scan is kept, that is,
            # the occurrence closest to the end of the window.
            if semicolon is None and word.endswith(';'):
                semicolon = i + 1
            elif comma is None and word.endswith(','):
                comma = i + 1
        else:
            # No full stop in the window: fall back to weaker punctuation,
            # and finally to a hard cut at the window size.
            cut = semicolon or comma or n

        yield ' '.join(words[:cut])
        words = words[cut:]

    if words:
        yield ' '.join(words)
|
|
|
|
|
|
class Translator:
    """
    Small asynchronous client for the translate.google.com web endpoints.

    It offers text translation (`translate`) and text-to-speech (`tts`).
    Both requests carry a `tk` token computed by `_calc_token` from the
    request text and a "tkk" seed scraped from the site's landing page.

    The instance owns an `aiohttp.ClientSession`; call `close()` when done.
    """

    # Extracts the two integer halves of the seed from the landing page.
    _TKK_RE = re.compile(r"tkk:'(\d+)\.(\d+)'", re.DOTALL)
    _BASE_URL = 'https://translate.google.com'
    _TRANSLATE_URL = 'https://translate.google.com/translate_a/single'
    _TRANSLATE_TTS_URL = 'https://translate.google.com/translate_tts'
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'
    }

    def __init__(self, target='en', source='auto'):
        """
        `target` and `source` are the default language codes used when the
        per-call arguments of `translate` / `tts` are not given.
        """
        self._target = target
        self._source = source
        self._session = aiohttp.ClientSession(headers=self._HEADERS)
        # Cached seed as a pair of ints, or None until first fetched.
        self._tkk = None
        self._tkk_lock = asyncio.Lock()

    async def _fetch_tkk(self):
        """
        Download the landing page and return the seed as a tuple of two ints.

        Raises `RuntimeError` if the seed cannot be found in the page.
        """
        async with self._session.get(self._BASE_URL) as resp:
            html = await resp.text()

        match = self._TKK_RE.search(html)
        if match is None:
            raise RuntimeError('could not find the tkk seed in the translate page')
        return tuple(map(int, match.groups()))

    def _need_refresh_tkk(self):
        """
        Return True if no seed is cached, or if the first half of the cached
        seed differs from the current number of whole hours since the epoch.
        """
        return (self._tkk is None) or (self._tkk[0] != int(time.time() / 3600))

    async def _ensure_tkk(self):
        """
        Make sure `self._tkk` holds a fresh seed, fetching it if needed.

        The freshness check is repeated after acquiring the lock so that
        callers which were waiting on the lock reuse the seed fetched by the
        first one instead of each issuing their own request.
        """
        if not self._need_refresh_tkk():
            return
        async with self._tkk_lock:
            if self._need_refresh_tkk():
                self._tkk = await self._fetch_tkk()

    def _calc_token(self, text):
        """
        Compute the `tk` request token for `text` from the cached seed.

        Original code by ultrafunkamsterdam/googletranslate:
        https://github.com/ultrafunkamsterdam/googletranslate/blob/bd3f4d0a1386ffa634c8ebbebb3603279f3ece99/googletranslate/__init__.py#L263

        If this ever breaks, the way it was found was in one of the top-100
        longest lines of `translate_m.js` used by translate.google.com, it
        uses a single-line with all these "magic" values and one can look
        around there and use a debugger to figure out how it works. It's
        a very straight-forward port.

        `self._tkk` must already be set (see `_ensure_tkk`). Returns a string
        of the form '<number>.<number>'.
        """
        def xor_rot(a, b):
            # `b` is read in groups of three characters:
            #   b[c]     '+' -> add modulo 2**32, anything else -> xor
            #   b[c + 1] '+' -> shift right (on the low 32 bits), else left
            #   b[c + 2] shift amount; 'a' and above map to ord - 87
            #            (so 'a' is 10), digits are taken at face value
            size_b = len(b)
            c = 0
            while c < size_b - 2:
                d = b[c + 2]
                d = ord(d[0]) - 87 if 'a' <= d else int(d)
                d = (a % 0x100000000) >> d if '+' == b[c + 1] else a << d
                a = a + d & 4294967295 if '+' == b[c] else a ^ d
                c += 3
            return a

        # Turn the text into a list of 16-bit code units.
        units = []
        text = helpers.add_surrogate(text)
        for char in text:
            val = ord(char)
            if val < 0x10000:
                units += [val]
            else:
                units += [
                    math.floor((val - 0x10000) / 0x400 + 0xD800),
                    math.floor((val - 0x10000) % 0x400 + 0xDC00),
                ]

        tkk = self._tkk
        seed = tkk[0]

        # Encode the code units as UTF-8 bytes: one byte below 128, two
        # below 2048, four for a valid surrogate pair, three otherwise.
        encoded = []
        g = 0
        size = len(text)
        while g < size:
            unit = units[g]
            if unit < 128:
                encoded.append(unit)
            else:
                if unit < 2048:
                    encoded.append(unit >> 6 | 192)
                else:
                    if (
                        (unit & 64512) == 55296
                        and g + 1 < size
                        and units[g + 1] & 64512 == 56320
                    ):
                        # High surrogate followed by a low one: merge both
                        # into a single code point and consume the second.
                        g += 1
                        unit = 65536 + ((unit & 1023) << 10) + (units[g] & 1023)
                        encoded.append(unit >> 18 | 240)
                        encoded.append(unit >> 12 & 63 | 128)
                    else:
                        encoded.append(unit >> 12 | 224)
                    encoded.append(unit >> 6 & 63 | 128)
                encoded.append(unit & 63 | 128)
            g += 1

        # Fold every byte into the accumulator, then apply the final mix.
        acc = seed
        for value in encoded:
            acc += value
            acc = xor_rot(acc, '+-a^+6')
        acc = xor_rot(acc, '+-3^+b+-f')
        acc ^= tkk[1]
        if acc < 0:
            acc = (acc & 2147483647) + 2147483648
        acc %= 1000000
        return '{}.{}'.format(acc, acc ^ seed)

    async def translate(self, text, target=None, source=None):
        """
        Translate `text` and return the translated string.

        `target` / `source` override the language codes given to the
        constructor for this call only.
        """
        await self._ensure_tkk()

        params = [
            ('client', 'webapp'),
            ('sl', source or self._source),
            ('tl', target or self._target),
            ('hl', 'en'),
            *[('dt', x) for x in ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'sos', 'ss', 't']],
            ('ie', 'UTF-8'),
            ('oe', 'UTF-8'),
            ('otf', 1),
            ('ssel', 0),
            ('tsel', 0),
            ('tk', self._calc_token(text)),
            ('q', text),
        ]

        async with self._session.get(self._TRANSLATE_URL, params=params) as resp:
            data = await resp.json()

        # The first element of the response is a list of parts; the first
        # item of each part is joined, skipping parts where it is None.
        return ''.join(part[0] for part in data[0] if part[0] is not None)

    async def tts(self, text, target=None):
        """
        Synthesize speech for `text` and return the audio as `bytes`.

        The text is split with `split_text` and one request is made per
        chunk; the response bodies are concatenated in order. `target`
        overrides the constructor's target language for this call only.

        Raises `ValueError` if the endpoint answers 404 for a chunk.
        """
        await self._ensure_tkk()

        parts = list(split_text(text))
        chunks = []
        for i, part in enumerate(parts):
            params = [
                ('ie', 'UTF-8'),
                ('q', part),
                ('tl', target or self._target),
                ('total', len(parts)),
                ('idx', i),
                ('textlen', len(helpers.add_surrogate(part))),
                ('tk', self._calc_token(part)),
                ('client', 'webapp'),
                ('prev', 'input'),
            ]

            async with self._session.get(self._TRANSLATE_TTS_URL, params=params) as resp:
                if resp.status == 404:
                    raise ValueError('unknown target language')
                chunks.append(await resp.read())

        return b''.join(chunks)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._session.close()
|
|
|
|
|
|
# Module-wide Translator instance used by the command handlers in this file;
# its HTTP session is closed again in `unload()`.
translator = Translator()
|
|
|
|
|
|
@borg.on(borg.cmd(r"tl"))
async def _(event):
    """
    Handle the `tl` command: translate some text and post the result.

    The text to translate is chosen as follows:

    * if the command replies to a message, that message's raw text;
    * otherwise, when not running as a bot, the most recent run of
      messages in the chat that were not sent by this account;
    * otherwise (bot account, no reply) nothing is done.

    If there is no text to translate the handler returns without sending
    anything. The result is written by editing the command message when
    running as a user, or by responding to it when running as a bot.
    """
    if event.is_reply:
        reply = await event.get_reply_message()
        # The replied-to message can be unavailable, in which case there is
        # nothing to translate.
        text = reply.raw_text if reply else ''
    elif not borg.me.bot:
        text = ''
        started = False
        async for m in borg.iter_messages(event.chat_id):
            # Stop at the first message of ours once we have begun
            # collecting messages from other senders.
            if started and m.sender_id == borg.uid:
                break
            if m.sender_id != borg.uid:
                started = True
            if not started or not m.raw_text:
                continue
            # Each message is prepended, so the collected text ends up in
            # the reverse of the iteration order. Messages containing a
            # space are put on their own line; single words are joined
            # with a space.
            if ' ' in m.raw_text:
                text = m.raw_text + '\n' + text
            else:
                text = m.raw_text + ' ' + text
    else:
        return

    text = (text or '').strip()
    if not text:
        # Same behaviour as the `tts` handler: silently ignore empty input
        # instead of sending an empty query to the translator.
        return

    translated = await translator.translate(text)
    action = event.edit if not borg.me.bot else event.respond
    await action('translation: ' + translated, parse_mode=None)
|
|
|
|
|
|
@borg.on(borg.cmd(r"tts"))
async def _(event):
    """
    Handle the `tts` command: speak some text and send it as a voice note.

    The text is whatever follows the command on the same message; if there
    is none and the command replies to a message, that message's raw text
    is used instead. With no text at all the handler does nothing further.

    When not running as a bot the command message itself is deleted first,
    and the voice note is sent as a reply to the message the command was
    replying to (if any).
    """
    if not borg.me.bot:
        await event.delete()

    # Everything after the first whitespace-separated token is the text.
    pieces = event.raw_text.split(maxsplit=1)
    text = pieces[1] if len(pieces) >= 2 else None

    if not text and event.is_reply:
        reply = await event.get_reply_message()
        # The replied-to message can be unavailable; treat that as no text.
        text = reply.raw_text if reply else None

    if not text:
        return

    file = io.BytesIO(await translator.tts(text))
    # The custom extension is mapped to 'audio/mpeg' via `mimetypes` at the
    # top of this file.
    file.name = 'a.borg+tts'
    await borg.send_file(
        event.chat_id,
        file,
        reply_to=event.reply_to_msg_id if not borg.me.bot else None,
        attributes=[types.DocumentAttributeAudio(
            duration=0,
            voice=True
        )]
    )
|
|
|
|
|
|
async def unload():
    """Release the module-wide translator by closing its HTTP session."""
    await translator.close()
|