diff --git a/davtelepot/__init__.py b/davtelepot/__init__.py index 0e9f5ad..d5a7750 100644 --- a/davtelepot/__init__.py +++ b/davtelepot/__init__.py @@ -11,7 +11,7 @@ __author__ = "Davide Testa" __email__ = "davide@davte.it" __credits__ = ["Marco Origlia", "Nick Lee @Nickoala"] __license__ = "GNU General Public License v3.0" -__version__ = "2.8.9" +__version__ = "2.8.10" __maintainer__ = "Davide Testa" __contact__ = "t.me/davte" diff --git a/davtelepot/administration_tools.py b/davtelepot/administration_tools.py index 5ef1306..9d25705 100644 --- a/davtelepot/administration_tools.py +++ b/davtelepot/administration_tools.py @@ -27,7 +27,7 @@ from davtelepot.messages import default_admin_messages, default_talk_messages from davtelepot.bot import Bot from davtelepot.utilities import ( async_wrapper, CachedPage, Confirmator, extract, get_cleaned_text, - get_user, escape_html_chars, line_drawing_unordered_list, make_button, + get_user, clean_html_string, line_drawing_unordered_list, make_button, make_inline_keyboard, remove_html_tags, send_part_of_text_file, send_csv_file, make_lines_of_buttons ) @@ -130,7 +130,7 @@ def get_talk_panel(bot: Bot, 'help_text', update=update, user_record=user_record, - q=escape_html_chars( + q=clean_html_string( remove_html_tags(text) ) ) @@ -155,7 +155,7 @@ def get_talk_panel(bot: Bot, 'user_not_found', update=update, user_record=user_record, - q=escape_html_chars( + q=clean_html_string( remove_html_tags(text) ) ) diff --git a/davtelepot/api.py b/davtelepot/api.py index 06d128a..a18be5b 100644 --- a/davtelepot/api.py +++ b/davtelepot/api.py @@ -382,7 +382,7 @@ class TelegramBot: @staticmethod def adapt_parameters(parameters, exclude=None): - """Build a aiohttp.FormData object from given `parameters`. + """Build an aiohttp.FormData object from given `parameters`. Exclude `self`, empty values and parameters in `exclude` list. Cast integers to string to avoid TypeError during json serialization. @@ -1058,7 +1058,7 @@ class TelegramBot: unbanned first. Note: In regular groups (non-supergroups), this method will only work if the ‘All Members Are Admins’ setting is off in the target group. - Otherwise members may only be removed by the group's creator or by + Otherwise, members may only be removed by the group's creator or by the member that added them. See https://core.telegram.org/bots/api#kickchatmember for details. """ @@ -1245,7 +1245,7 @@ class TelegramBot: ) async def getChat(self, chat_id: Union[int, str]): - """Get up to date information about the chat. + """Get up-to-date information about the chat. Return a Chat object on success. See https://core.telegram.org/bots/api#getchat for details. diff --git a/davtelepot/bot.py b/davtelepot/bot.py index 2bb5f5a..fa4fbed 100644 --- a/davtelepot/bot.py +++ b/davtelepot/bot.py @@ -54,7 +54,7 @@ from davtelepot.database import ObjectWithDatabase from davtelepot.languages import MultiLanguageObject from davtelepot.messages import davtelepot_messages from davtelepot.utilities import ( - async_get, escape_html_chars, extract, get_secure_key, + async_get, clean_html_string, extract, get_secure_key, make_inline_query_answer, make_lines_of_buttons, remove_html_tags ) @@ -69,7 +69,7 @@ logging.getLogger('chardet').setLevel(logging.WARNING) class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): """Simple Bot object, providing methods corresponding to Telegram bot API. - Multiple Bot() instances may be run together, along with a aiohttp web app. + Multiple Bot() instances may be run together, along with an aiohttp web app. """ bots = [] @@ -347,7 +347,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): @property def errors_file_path(self): - """Return errors file path basing on self.path and `_errors_file_name`. + """Return errors file path basing on `self.path` and `_errors_file_name`. Fallback to class file if set, otherwise return None. """ @@ -417,7 +417,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): """Maximum number of simultaneous HTTPS connections allowed. Telegram will open as many connections as possible to boost bot’s - throughput, lower values limit the load on bot‘s server. + throughput, lower values limit the load on bot's server. """ return self._max_connections @@ -477,7 +477,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): def allowed_during_maintenance(self): """Return the list of criteria to allow an update during maintenance. - If any of this criteria returns True on an update, that update will be + If any of these criteria returns True on an update, that update will be handled even during maintenance. """ return self._allowed_during_maintenance @@ -858,7 +858,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): elif 'chat' in update and update['chat']['id'] > 0: reply = dict(text=self.unknown_command_message) else: # Handle command aliases and text parsers - # Aliases are case insensitive: text and alias are both .lower() + # Aliases are case-insensitive: text and alias are both .lower() for alias, function in self.command_aliases.items(): if lowered_text.startswith(alias.lower()): replier = function @@ -1222,7 +1222,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): with proper code markdown. """ if parse_mode == 'HTML': - text = escape_html_chars(text) + text = clean_html_string(text) tags = ( ('`', '`') if parse_mode == 'Markdown' @@ -1591,7 +1591,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): photo.startswith(url_starter) for url_starter in ('http', 'www',) ] - ): # If `photo` is not a url but a local file path + ): # If `photo` is not a URL but a local file path try: with io.BytesIO() as buffered_picture: with open( @@ -1716,7 +1716,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): audio.startswith(url_starter) for url_starter in ('http', 'www',) ] - ): # If `audio` is not a url but a local file path + ): # If `audio` is not a URL but a local file path try: with io.BytesIO() as buffered_picture: with open( @@ -1841,7 +1841,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): voice.startswith(url_starter) for url_starter in ('http', 'www',) ] - ): # If `voice` is not a url but a local file path + ): # If `voice` is not a URL but a local file path try: with io.BytesIO() as buffered_picture: with open( @@ -1977,7 +1977,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): document_path.startswith(url_starter) for url_starter in ('http', 'www',) ] - ): # If `document_path` is not a url but a local file path + ): # If `document_path` is not a URL but a local file path try: with open( document_path.format( @@ -3162,7 +3162,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject): allowed_updates : List(str) List of update types to be retrieved. Empty list to allow all updates. - None to fallback to class default. + None to fall back to class default. """ # Return if token is invalid await self.get_me() diff --git a/davtelepot/utilities.py b/davtelepot/utilities.py index af71434..e542278 100644 --- a/davtelepot/utilities.py +++ b/davtelepot/utilities.py @@ -16,9 +16,9 @@ import string import time from difflib import SequenceMatcher +from typing import Tuple, Union # Third party modules -from typing import Tuple, Union import aiohttp from bs4 import BeautifulSoup @@ -1251,7 +1251,7 @@ def parse_datetime_interval_string(text): result_text.pop() if len(result_text) > 0 and result_text[-1].lower() in TIME_WORDS: result_text.pop() - result_text = escape_html_chars( + result_text = clean_html_string( ' '.join(result_text) ) parsers = list( @@ -1330,6 +1330,22 @@ MONTH_NAMES_ITA[10] = "ottobre" MONTH_NAMES_ITA[11] = "novembre" MONTH_NAMES_ITA[12] = "dicembre" +allowed_html_tags = ['b', 'strong', + 'i', 'em', + 'u', 'ins', + 's', 'strike', 'del', + 'span', 'tg-spoiler', + 'a', + 'code', 'pre'] + +HTML_SYMBOLS = collections.OrderedDict() +HTML_SYMBOLS["&"] = "&" +HTML_SYMBOLS["<"] = "<" +HTML_SYMBOLS[">"] = ">" +HTML_SYMBOLS["\""] = """ + +html_numeric_code_regex = re.compile(r'&(?P#\d{2,3};)') + def beautytd(td): """Format properly timedeltas.""" @@ -1410,67 +1426,56 @@ def beautydt(dt): return result -HTML_SYMBOLS = MyOD() -HTML_SYMBOLS["&"] = "&" -HTML_SYMBOLS["<"] = "<" -HTML_SYMBOLS[">"] = ">" -HTML_SYMBOLS["\""] = """ -HTML_SYMBOLS["<b>"] = "" -HTML_SYMBOLS["</b>"] = "" -HTML_SYMBOLS["<i>"] = "" -HTML_SYMBOLS["</i>"] = "" -HTML_SYMBOLS["<code>"] = "" -HTML_SYMBOLS["</code>"] = "" -HTML_SYMBOLS["<pre>"] = "
"
-HTML_SYMBOLS["</pre>"] = "
" -HTML_SYMBOLS["<a href=""] = "" -HTML_SYMBOLS["</a>"] = "" +def clean_html_string(text: str) -> str: + """Escape HTML symbols, unless part of a valid tag or numeric code character. -HTML_TAGS = [ - None, "", "", - None, "", "", - None, "", "", - None, "
", "
", - None, "", "", - None -] - - -def remove_html_tags(text): - """Remove HTML tags from `text`.""" - for tag in HTML_TAGS: - if tag is None: - continue - text = text.replace(tag, '') + Find valid HTML tags; + if there are any, choose the first occurring and call the function + recursively on what comes before the tag, inside the tag and after the + tag, preserving the tag opening and close as they are; + if there aren't any, escape HTML symbols except for `&` in HTML numeric code + characters (`&#` followed by 2 or 3 digits followed by `;`). + """ + first_match = None + for tag in allowed_html_tags: + if tag in ('a', ): # must have href attribute + attribute = r" href=\".*\"" + elif tag in ('span', ): # must have class attribute with "tg-spoiler" value + attribute = r" class=\"tg-spoiler\"" + elif tag in ('code',): # may have a class with a programming language as value + attribute = r"( class=\".*\")?" + else: + attribute = "" + match = re.search( + rf'(?P<{tag}{attribute}>)' + rf'(?P.*?)' + rf'(?P)', + text + ) + if match and (first_match is None or match.start() < first_match.start()): + first_match = match + if first_match is not None: + groups = first_match.groupdict() + text = (f"{clean_html_string(text[:first_match.start()])}" + f"{groups['opening']}{clean_html_string(groups['body'])}{groups['close']}" + f"{clean_html_string(text[first_match.end():])}") + else: + for key, value in HTML_SYMBOLS.items(): + text = text.replace(key, value) + if re.search(html_numeric_code_regex, text): + text = re.sub(html_numeric_code_regex, r'&\g', text) return text def escape_html_chars(text): - """Escape HTML chars if not part of a tag.""" - for s, r in HTML_SYMBOLS.items(): - text = text.replace(s, r) - copy = text - expected_tag = None - while copy: - min_ = min( - ( - dict( - position=copy.find(tag) if tag in copy else len(copy), - tag=tag - ) - for tag in HTML_TAGS - if tag - ), - key=lambda x: x['position'], - default=0 - ) - if min_['position'] == len(copy): - break - if expected_tag and min_['tag'] != expected_tag: - return text.replace('<', '_').replace('>', '_') - expected_tag = HTML_TAGS[HTML_TAGS.index(min_['tag'])+1] - copy = extract(copy, min_['tag']) + logging.error("`escape_html_chars` function deprecated, use `clean_html_string` instead.") + return clean_html_string(text) + + +def remove_html_tags(text): + """Remove HTML tags from `text`.""" + for tag in allowed_html_tags: + text = re.sub(rf'', '', text) return text