# solo-level-app-persistance-.../venv/lib/python3.11/site-packages/gtts/utils.py
#
# 84 lines
# 3.0 KiB
# Python

# -*- coding: utf-8 -*-
from gtts.tokenizer.symbols import ALL_PUNC as punc
from string import whitespace as ws
import re
# Anchored pattern (``^...$``) whose character class is built from the escaped
# punctuation symbols (``punc``) plus ``string.whitespace`` (``ws``). Because
# the class is quantified with ``*``, the empty string matches as well.
_ALL_PUNC_OR_SPACE = re.compile(u"^[{}]*$".format(re.escape(punc + ws)))
"""Regex that matches if an entire line is only comprised
of whitespace and punctuation
"""
def _minimize(the_string, delim, max_size):
"""Recursively split a string in the largest chunks
possible from the highest position of a delimiter all the way
to a maximum size
Args:
the_string (string): The string to split.
delim (string): The delimiter to split on.
max_size (int): The maximum size of a chunk.
Returns:
list: the minimized string in tokens
Every chunk size will be at minimum ``the_string[0:idx]`` where ``idx``
is the highest index of ``delim`` found in ``the_string``; and at maximum
``the_string[0:max_size]`` if no ``delim`` was found in ``the_string``.
In the latter case, the split will occur at ``the_string[max_size]``
which can be any character. The function runs itself again on the rest of
``the_string`` (``the_string[idx:]``) until no chunk is larger than
``max_size``.
"""
# Remove `delim` from start of `the_string`
# i.e. prevent a recursive infinite loop on `the_string[0:0]`
# if `the_string` starts with `delim` and is larger than `max_size`
if the_string.startswith(delim):
the_string = the_string[len(delim):]
if len(the_string) > max_size:
try:
# Find the highest index of `delim` in `the_string[0:max_size]`
# i.e. `the_string` will be cut in half on `delim` index
idx = the_string.rindex(delim, 0, max_size)
except ValueError:
# `delim` not found in `the_string`, index becomes `max_size`
# i.e. `the_string` will be cut in half arbitrarily on `max_size`
idx = max_size
# Call itself again for `the_string[idx:]`
return [the_string[:idx]] + _minimize(the_string[idx:], delim, max_size)
else:
return [the_string]
def _clean_tokens(tokens):
    """Discard punctuation/whitespace-only tokens and strip the others.

    Args:
        tokens (list): A list of strings (tokens) to clean.

    Returns:
        list: The tokens that did not match ``_ALL_PUNC_OR_SPACE``, each
        with its leading and trailing whitespace stripped, in their
        original order.

    """
    cleaned = []
    for token in tokens:
        # Skip tokens made up solely of punctuation and/or whitespace
        if _ALL_PUNC_OR_SPACE.match(token):
            continue
        cleaned.append(token.strip())
    return cleaned
def _translate_url(tld="com", path=""):
"""Generates a Google Translate URL
Args:
tld (string): Top-level domain for the Google Translate host,
i.e ``https://translate.google.<tld>``. Default is ``com``.
path: (string): A path to append to the Google Translate host,
i.e ``https://translate.google.com/<path>``. Default is ``""``.
Returns:
string: A Google Translate URL `https://translate.google.<tld>/path`
"""
_GOOGLE_TTS_URL = "https://translate.google.{}/{}"
return _GOOGLE_TTS_URL.format(tld, path)