# -*- coding: utf-8 -*-
from gtts.tokenizer.symbols import ALL_PUNC as punc
from string import whitespace as ws
import re

_ALL_PUNC_OR_SPACE = re.compile(u"^[{}]*$".format(re.escape(punc + ws)))
"""Regex that matches if an entire line consists only
of whitespace and punctuation

"""


def _minimize(the_string, delim, max_size):
    """Recursively split a string into the largest possible chunks,
    cutting at the highest position of a delimiter, up to
    a maximum size

    Args:
        the_string (string): The string to split.
        delim (string): The delimiter to split on.
        max_size (int): The maximum size of a chunk.

    Returns:
        list: The minimized string as a list of tokens.

    Every chunk size will be at minimum ``the_string[0:idx]`` where ``idx``
    is the highest index of ``delim`` found in ``the_string``; and at maximum
    ``the_string[0:max_size]`` if no ``delim`` was found in ``the_string``.
    In the latter case, the split occurs at ``the_string[max_size]``,
    which can be any character. The function calls itself again on the rest of
    ``the_string`` (``the_string[idx:]``) until no chunk is larger than
    ``max_size``.

    """
    # Remove `delim` from the start of `the_string`,
    # i.e. prevent a recursive infinite loop on `the_string[0:0]`
    # if `the_string` starts with `delim` and is larger than `max_size`
    if the_string.startswith(delim):
        the_string = the_string[len(delim):]

    if len(the_string) > max_size:
        try:
            # Find the highest index of `delim` in `the_string[0:max_size]`,
            # i.e. `the_string` will be cut at that `delim` index
            idx = the_string.rindex(delim, 0, max_size)
        except ValueError:
            # `delim` was not found in `the_string`; the index becomes
            # `max_size`, i.e. `the_string` will be cut arbitrarily there
            idx = max_size
        # Call itself again for `the_string[idx:]`
        return [the_string[:idx]] + _minimize(the_string[idx:], delim, max_size)
    else:
        return [the_string]
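
# A minimal usage sketch (illustrative only, not part of the module): split a
# sentence on spaces into chunks of at most 30 characters. The cut falls on
# the last space found before the limit, and a leading delimiter is stripped
# from the remainder before recursing.
#
#   _minimize("the quick brown fox jumps over the lazy dog", " ", 30)
#   # -> ['the quick brown fox jumps', 'over the lazy dog']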


def _clean_tokens(tokens):
    """Clean a list of strings

    Args:
        tokens (list): A list of strings (tokens) to clean.

    Returns:
        list: The stripped strings of ``tokens``, without the original
            elements that consisted only of whitespace and/or punctuation
            characters.

    """
    return [t.strip() for t in tokens if not _ALL_PUNC_OR_SPACE.match(t)]
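
# A minimal usage sketch (illustrative only): tokens made up entirely of
# whitespace and/or punctuation are dropped; the remaining tokens are stripped.
#
#   _clean_tokens(["  Hello, world.  ", "...", "   ", "gTTS"])
#   # -> ['Hello, world.', 'gTTS']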


def _translate_url(tld="com", path=""):
    """Generates a Google Translate URL

    Args:
        tld (string): Top-level domain for the Google Translate host,
            i.e. ``https://translate.google.<tld>``. Default is ``com``.
        path (string): A path to append to the Google Translate host,
            i.e. ``https://translate.google.com/<path>``. Default is ``""``.

    Returns:
        string: A Google Translate URL ``https://translate.google.<tld>/<path>``

    """
    _GOOGLE_TTS_URL = "https://translate.google.{}/{}"
    return _GOOGLE_TTS_URL.format(tld, path)
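
# A minimal usage sketch (illustrative only; the path value is an arbitrary
# example, not a statement about which endpoints exist):
#
#   _translate_url()                                # -> 'https://translate.google.com/'
#   _translate_url(tld="fr", path="translate_tts")  # -> 'https://translate.google.fr/translate_tts'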