HTML parse_mode check upgraded.

All supported tags are now permitted; malformed tags are replaced with escaped characters.
This commit is contained in:
Davte 2022-12-12 22:43:44 +01:00
parent 55b47ed1f7
commit fafa639328
Signed by: Davte
GPG Key ID: 70336F92E6814706
5 changed files with 82 additions and 77 deletions

View File

@ -11,7 +11,7 @@ __author__ = "Davide Testa"
__email__ = "davide@davte.it"
__credits__ = ["Marco Origlia", "Nick Lee @Nickoala"]
__license__ = "GNU General Public License v3.0"
__version__ = "2.8.9"
__version__ = "2.8.10"
__maintainer__ = "Davide Testa"
__contact__ = "t.me/davte"

View File

@ -27,7 +27,7 @@ from davtelepot.messages import default_admin_messages, default_talk_messages
from davtelepot.bot import Bot
from davtelepot.utilities import (
async_wrapper, CachedPage, Confirmator, extract, get_cleaned_text,
get_user, escape_html_chars, line_drawing_unordered_list, make_button,
get_user, clean_html_string, line_drawing_unordered_list, make_button,
make_inline_keyboard, remove_html_tags, send_part_of_text_file,
send_csv_file, make_lines_of_buttons
)
@ -130,7 +130,7 @@ def get_talk_panel(bot: Bot,
'help_text',
update=update,
user_record=user_record,
q=escape_html_chars(
q=clean_html_string(
remove_html_tags(text)
)
)
@ -155,7 +155,7 @@ def get_talk_panel(bot: Bot,
'user_not_found',
update=update,
user_record=user_record,
q=escape_html_chars(
q=clean_html_string(
remove_html_tags(text)
)
)

View File

@ -382,7 +382,7 @@ class TelegramBot:
@staticmethod
def adapt_parameters(parameters, exclude=None):
"""Build a aiohttp.FormData object from given `parameters`.
"""Build an aiohttp.FormData object from given `parameters`.
Exclude `self`, empty values and parameters in `exclude` list.
Cast integers to string to avoid TypeError during json serialization.
@ -1058,7 +1058,7 @@ class TelegramBot:
unbanned first.
Note: In regular groups (non-supergroups), this method will only work
if the All Members Are Admins setting is off in the target group.
Otherwise members may only be removed by the group's creator or by
Otherwise, members may only be removed by the group's creator or by
the member that added them.
See https://core.telegram.org/bots/api#kickchatmember for details.
"""
@ -1245,7 +1245,7 @@ class TelegramBot:
)
async def getChat(self, chat_id: Union[int, str]):
"""Get up to date information about the chat.
"""Get up-to-date information about the chat.
Return a Chat object on success.
See https://core.telegram.org/bots/api#getchat for details.

View File

@ -54,7 +54,7 @@ from davtelepot.database import ObjectWithDatabase
from davtelepot.languages import MultiLanguageObject
from davtelepot.messages import davtelepot_messages
from davtelepot.utilities import (
async_get, escape_html_chars, extract, get_secure_key,
async_get, clean_html_string, extract, get_secure_key,
make_inline_query_answer, make_lines_of_buttons, remove_html_tags
)
@ -69,7 +69,7 @@ logging.getLogger('chardet').setLevel(logging.WARNING)
class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
"""Simple Bot object, providing methods corresponding to Telegram bot API.
Multiple Bot() instances may be run together, along with a aiohttp web app.
Multiple Bot() instances may be run together, along with an aiohttp web app.
"""
bots = []
@ -347,7 +347,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
@property
def errors_file_path(self):
"""Return errors file path basing on self.path and `_errors_file_name`.
"""Return errors file path basing on `self.path` and `_errors_file_name`.
Fallback to class file if set, otherwise return None.
"""
@ -417,7 +417,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
"""Maximum number of simultaneous HTTPS connections allowed.
Telegram will open as many connections as possible to boost bots
throughput, lower values limit the load on bots server.
throughput, lower values limit the load on bot's server.
"""
return self._max_connections
@ -477,7 +477,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
def allowed_during_maintenance(self):
"""Return the list of criteria to allow an update during maintenance.
If any of this criteria returns True on an update, that update will be
If any of these criteria returns True on an update, that update will be
handled even during maintenance.
"""
return self._allowed_during_maintenance
@ -858,7 +858,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
elif 'chat' in update and update['chat']['id'] > 0:
reply = dict(text=self.unknown_command_message)
else: # Handle command aliases and text parsers
# Aliases are case insensitive: text and alias are both .lower()
# Aliases are case-insensitive: text and alias are both .lower()
for alias, function in self.command_aliases.items():
if lowered_text.startswith(alias.lower()):
replier = function
@ -1222,7 +1222,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
with proper code markdown.
"""
if parse_mode == 'HTML':
text = escape_html_chars(text)
text = clean_html_string(text)
tags = (
('`', '`')
if parse_mode == 'Markdown'
@ -1591,7 +1591,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
photo.startswith(url_starter)
for url_starter in ('http', 'www',)
]
): # If `photo` is not a url but a local file path
): # If `photo` is not a URL but a local file path
try:
with io.BytesIO() as buffered_picture:
with open(
@ -1716,7 +1716,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
audio.startswith(url_starter)
for url_starter in ('http', 'www',)
]
): # If `audio` is not a url but a local file path
): # If `audio` is not a URL but a local file path
try:
with io.BytesIO() as buffered_picture:
with open(
@ -1841,7 +1841,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
voice.startswith(url_starter)
for url_starter in ('http', 'www',)
]
): # If `voice` is not a url but a local file path
): # If `voice` is not a URL but a local file path
try:
with io.BytesIO() as buffered_picture:
with open(
@ -1977,7 +1977,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
document_path.startswith(url_starter)
for url_starter in ('http', 'www',)
]
): # If `document_path` is not a url but a local file path
): # If `document_path` is not a URL but a local file path
try:
with open(
document_path.format(
@ -3162,7 +3162,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
allowed_updates : List(str)
List of update types to be retrieved.
Empty list to allow all updates.
None to fallback to class default.
None to fall back to class default.
"""
# Return if token is invalid
await self.get_me()

View File

@ -16,9 +16,9 @@ import string
import time
from difflib import SequenceMatcher
from typing import Tuple, Union
# Third party modules
from typing import Tuple, Union
import aiohttp
from bs4 import BeautifulSoup
@ -1251,7 +1251,7 @@ def parse_datetime_interval_string(text):
result_text.pop()
if len(result_text) > 0 and result_text[-1].lower() in TIME_WORDS:
result_text.pop()
result_text = escape_html_chars(
result_text = clean_html_string(
' '.join(result_text)
)
parsers = list(
@ -1330,6 +1330,22 @@ MONTH_NAMES_ITA[10] = "ottobre"
MONTH_NAMES_ITA[11] = "novembre"
MONTH_NAMES_ITA[12] = "dicembre"
# Names of the HTML tags that `clean_html_string` leaves untouched and
# `remove_html_tags` strips (the tags supported by HTML parse_mode).
allowed_html_tags = ['b', 'strong',
                     'i', 'em',
                     'u', 'ins',
                     's', 'strike', 'del',
                     'span', 'tg-spoiler',
                     'a',
                     'code', 'pre']

# Replacement table applied in insertion order when escaping text.
# `&` must come first: escaping it later would also re-escape the `&`
# introduced by the `&lt;`, `&gt;` and `&quot;` replacements.
HTML_SYMBOLS = collections.OrderedDict()
HTML_SYMBOLS["&"] = "&amp;"
HTML_SYMBOLS["<"] = "&lt;"
HTML_SYMBOLS[">"] = "&gt;"
HTML_SYMBOLS["\""] = "&quot;"

# Matches an already-escaped numeric character reference (`&amp;#NN;` or
# `&amp;#NNN;`) so that the leading `&amp;` can be turned back into `&`.
html_numeric_code_regex = re.compile(r'&amp;(?P<code>#\d{2,3};)')
def beautytd(td):
"""Format properly timedeltas."""
@ -1410,67 +1426,56 @@ def beautydt(dt):
return result
HTML_SYMBOLS = MyOD()
HTML_SYMBOLS["&"] = "&amp;"
HTML_SYMBOLS["<"] = "&lt;"
HTML_SYMBOLS[">"] = "&gt;"
HTML_SYMBOLS["\""] = "&quot;"
HTML_SYMBOLS["&lt;b&gt;"] = "<b>"
HTML_SYMBOLS["&lt;/b&gt;"] = "</b>"
HTML_SYMBOLS["&lt;i&gt;"] = "<i>"
HTML_SYMBOLS["&lt;/i&gt;"] = "</i>"
HTML_SYMBOLS["&lt;code&gt;"] = "<code>"
HTML_SYMBOLS["&lt;/code&gt;"] = "</code>"
HTML_SYMBOLS["&lt;pre&gt;"] = "<pre>"
HTML_SYMBOLS["&lt;/pre&gt;"] = "</pre>"
HTML_SYMBOLS["&lt;a href=&quot;"] = "<a href=\""
HTML_SYMBOLS["&quot;&gt;"] = "\">"
HTML_SYMBOLS["&lt;/a&gt;"] = "</a>"
def _html_tag_pattern(tag: str) -> str:
    """Return the regex source matching one complete `tag` element.

    The pattern exposes three named groups: `opening` (the opening tag with
    its attribute, if any), `body` (the shortest possible content) and
    `close` (the closing tag).
    """
    if tag == 'a':  # <a> must have href attribute
        # `[^\"]*` instead of `.*`: the value must stop at the first closing
        # quote, otherwise the "opening tag" swallows text and other tags.
        attribute = r" href=\"[^\"]*\""
    elif tag == 'span':  # <span> must have class attribute with "tg-spoiler" value
        attribute = r" class=\"tg-spoiler\""
    elif tag == 'code':  # <code> may have a class with a programming language as value
        attribute = r"( class=\"[^\"]*\")?"
    else:
        attribute = ""
    return (rf'(?P<opening><{re.escape(tag)}{attribute}>)'
            rf'(?P<body>.*?)'
            rf'(?P<close></{re.escape(tag)}>)')


def _find_first_html_tag(text: str):
    """Return the earliest match of any allowed tag in `text`, or None."""
    first_match = None
    for tag in allowed_html_tags:
        # DOTALL lets the body of a tag span several lines.
        match = re.search(_html_tag_pattern(tag), text, flags=re.DOTALL)
        if match and (first_match is None or match.start() < first_match.start()):
            first_match = match
    return first_match


def _escape_html_symbols(text: str) -> str:
    """Escape every symbol in `HTML_SYMBOLS`, preserving numeric codes."""
    for symbol, escaped in HTML_SYMBOLS.items():
        text = text.replace(symbol, escaped)
    # Undo the escaping of `&` in numeric character references.
    return html_numeric_code_regex.sub(r'&\g<code>', text)


def clean_html_string(text: str) -> str:
    """Escape HTML symbols, unless part of a valid tag or numeric code character.

    Scan `text` from left to right looking for the first valid HTML tag
    (one of `allowed_html_tags`, with both opening and closing parts):
    - what comes before the tag contains no valid tag, so it is escaped;
    - the tag opening and close are preserved as they are;
    - the tag body is cleaned recursively, so nested tags are supported;
    - the scan then continues on what comes after the tag.
    When no valid tag is left, HTML symbols in the remaining text are escaped,
    except for `&` in HTML numeric code characters (`&#` followed by 2 or 3
    digits followed by `;`).

    Return the cleaned string.
    """
    pieces = []
    remaining = text
    # Iterate over sibling tags instead of recursing on the tail, so that the
    # recursion depth depends on tag nesting only, not on the number of tags.
    while remaining:
        first_match = _find_first_html_tag(remaining)
        if first_match is None:
            break
        pieces.append(_escape_html_symbols(remaining[:first_match.start()]))
        pieces.append(first_match.group('opening'))
        pieces.append(clean_html_string(first_match.group('body')))
        pieces.append(first_match.group('close'))
        remaining = remaining[first_match.end():]
    pieces.append(_escape_html_symbols(remaining))
    return ''.join(pieces)
def escape_html_chars(text):
    """Deprecated alias of `clean_html_string`.

    Log an error-level deprecation notice, then delegate to
    `clean_html_string` and return its result unchanged.
    """
    deprecation_notice = (
        "`escape_html_chars` function deprecated, "
        "use `clean_html_string` instead."
    )
    logging.error(deprecation_notice)
    cleaned_text = clean_html_string(text)
    return cleaned_text
def remove_html_tags(text):
    """Remove HTML tags from `text`.

    For every tag name in `allowed_html_tags`, strip its opening and closing
    forms, with or without a single `href` or `class` attribute.
    The content between the tags is kept.
    Return the resulting string.
    """
    for tag in allowed_html_tags:
        # `[^\"]*` instead of `.*`: a greedy attribute value would run up to
        # the last quoted attribute on the line, deleting the text in between
        # (e.g. the visible text between two links).
        text = re.sub(
            rf'</?{re.escape(tag)}( (href|class)=\"[^\"]*\")?>',
            '',
            text
        )
    return text