HTML parse_mode check upgraded.
All supported tags permitted, malformed tags replaced with escaped characters
This commit is contained in:
parent
55b47ed1f7
commit
fafa639328
@ -11,7 +11,7 @@ __author__ = "Davide Testa"
|
||||
__email__ = "davide@davte.it"
|
||||
__credits__ = ["Marco Origlia", "Nick Lee @Nickoala"]
|
||||
__license__ = "GNU General Public License v3.0"
|
||||
__version__ = "2.8.9"
|
||||
__version__ = "2.8.10"
|
||||
__maintainer__ = "Davide Testa"
|
||||
__contact__ = "t.me/davte"
|
||||
|
||||
|
@ -27,7 +27,7 @@ from davtelepot.messages import default_admin_messages, default_talk_messages
|
||||
from davtelepot.bot import Bot
|
||||
from davtelepot.utilities import (
|
||||
async_wrapper, CachedPage, Confirmator, extract, get_cleaned_text,
|
||||
get_user, escape_html_chars, line_drawing_unordered_list, make_button,
|
||||
get_user, clean_html_string, line_drawing_unordered_list, make_button,
|
||||
make_inline_keyboard, remove_html_tags, send_part_of_text_file,
|
||||
send_csv_file, make_lines_of_buttons
|
||||
)
|
||||
@ -130,7 +130,7 @@ def get_talk_panel(bot: Bot,
|
||||
'help_text',
|
||||
update=update,
|
||||
user_record=user_record,
|
||||
q=escape_html_chars(
|
||||
q=clean_html_string(
|
||||
remove_html_tags(text)
|
||||
)
|
||||
)
|
||||
@ -155,7 +155,7 @@ def get_talk_panel(bot: Bot,
|
||||
'user_not_found',
|
||||
update=update,
|
||||
user_record=user_record,
|
||||
q=escape_html_chars(
|
||||
q=clean_html_string(
|
||||
remove_html_tags(text)
|
||||
)
|
||||
)
|
||||
|
@ -382,7 +382,7 @@ class TelegramBot:
|
||||
|
||||
@staticmethod
|
||||
def adapt_parameters(parameters, exclude=None):
|
||||
"""Build a aiohttp.FormData object from given `parameters`.
|
||||
"""Build an aiohttp.FormData object from given `parameters`.
|
||||
|
||||
Exclude `self`, empty values and parameters in `exclude` list.
|
||||
Cast integers to string to avoid TypeError during json serialization.
|
||||
@ -1058,7 +1058,7 @@ class TelegramBot:
|
||||
unbanned first.
|
||||
Note: In regular groups (non-supergroups), this method will only work
|
||||
if the ‘All Members Are Admins’ setting is off in the target group.
|
||||
Otherwise members may only be removed by the group's creator or by
|
||||
Otherwise, members may only be removed by the group's creator or by
|
||||
the member that added them.
|
||||
See https://core.telegram.org/bots/api#kickchatmember for details.
|
||||
"""
|
||||
@ -1245,7 +1245,7 @@ class TelegramBot:
|
||||
)
|
||||
|
||||
async def getChat(self, chat_id: Union[int, str]):
|
||||
"""Get up to date information about the chat.
|
||||
"""Get up-to-date information about the chat.
|
||||
|
||||
Return a Chat object on success.
|
||||
See https://core.telegram.org/bots/api#getchat for details.
|
||||
|
@ -54,7 +54,7 @@ from davtelepot.database import ObjectWithDatabase
|
||||
from davtelepot.languages import MultiLanguageObject
|
||||
from davtelepot.messages import davtelepot_messages
|
||||
from davtelepot.utilities import (
|
||||
async_get, escape_html_chars, extract, get_secure_key,
|
||||
async_get, clean_html_string, extract, get_secure_key,
|
||||
make_inline_query_answer, make_lines_of_buttons, remove_html_tags
|
||||
)
|
||||
|
||||
@ -69,7 +69,7 @@ logging.getLogger('chardet').setLevel(logging.WARNING)
|
||||
class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
"""Simple Bot object, providing methods corresponding to Telegram bot API.
|
||||
|
||||
Multiple Bot() instances may be run together, along with a aiohttp web app.
|
||||
Multiple Bot() instances may be run together, along with an aiohttp web app.
|
||||
"""
|
||||
|
||||
bots = []
|
||||
@ -347,7 +347,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
|
||||
@property
|
||||
def errors_file_path(self):
|
||||
"""Return errors file path basing on self.path and `_errors_file_name`.
|
||||
"""Return errors file path basing on `self.path` and `_errors_file_name`.
|
||||
|
||||
Fallback to class file if set, otherwise return None.
|
||||
"""
|
||||
@ -417,7 +417,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
"""Maximum number of simultaneous HTTPS connections allowed.
|
||||
|
||||
Telegram will open as many connections as possible to boost bot’s
|
||||
throughput, lower values limit the load on bot‘s server.
|
||||
throughput, lower values limit the load on bot's server.
|
||||
"""
|
||||
return self._max_connections
|
||||
|
||||
@ -477,7 +477,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
def allowed_during_maintenance(self):
|
||||
"""Return the list of criteria to allow an update during maintenance.
|
||||
|
||||
If any of this criteria returns True on an update, that update will be
|
||||
If any of these criteria returns True on an update, that update will be
|
||||
handled even during maintenance.
|
||||
"""
|
||||
return self._allowed_during_maintenance
|
||||
@ -858,7 +858,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
elif 'chat' in update and update['chat']['id'] > 0:
|
||||
reply = dict(text=self.unknown_command_message)
|
||||
else: # Handle command aliases and text parsers
|
||||
# Aliases are case insensitive: text and alias are both .lower()
|
||||
# Aliases are case-insensitive: text and alias are both .lower()
|
||||
for alias, function in self.command_aliases.items():
|
||||
if lowered_text.startswith(alias.lower()):
|
||||
replier = function
|
||||
@ -1222,7 +1222,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
with proper code markdown.
|
||||
"""
|
||||
if parse_mode == 'HTML':
|
||||
text = escape_html_chars(text)
|
||||
text = clean_html_string(text)
|
||||
tags = (
|
||||
('`', '`')
|
||||
if parse_mode == 'Markdown'
|
||||
@ -1591,7 +1591,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
photo.startswith(url_starter)
|
||||
for url_starter in ('http', 'www',)
|
||||
]
|
||||
): # If `photo` is not a url but a local file path
|
||||
): # If `photo` is not a URL but a local file path
|
||||
try:
|
||||
with io.BytesIO() as buffered_picture:
|
||||
with open(
|
||||
@ -1716,7 +1716,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
audio.startswith(url_starter)
|
||||
for url_starter in ('http', 'www',)
|
||||
]
|
||||
): # If `audio` is not a url but a local file path
|
||||
): # If `audio` is not a URL but a local file path
|
||||
try:
|
||||
with io.BytesIO() as buffered_picture:
|
||||
with open(
|
||||
@ -1841,7 +1841,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
voice.startswith(url_starter)
|
||||
for url_starter in ('http', 'www',)
|
||||
]
|
||||
): # If `voice` is not a url but a local file path
|
||||
): # If `voice` is not a URL but a local file path
|
||||
try:
|
||||
with io.BytesIO() as buffered_picture:
|
||||
with open(
|
||||
@ -1977,7 +1977,7 @@ class Bot(TelegramBot, ObjectWithDatabase, MultiLanguageObject):
|
||||
document_path.startswith(url_starter)
|
||||
for url_starter in ('http', 'www',)
|
||||
]
|
||||
): # If `document_path` is not a url but a local file path
|
||||
): # If `document_path` is not a URL but a local file path
|
||||
try:
|
||||
with open(
|
||||
document_path.format(
|
||||
|
@ -16,9 +16,9 @@ import string
|
||||
import time
|
||||
|
||||
from difflib import SequenceMatcher
|
||||
from typing import Tuple, Union
|
||||
|
||||
# Third party modules
|
||||
from typing import Tuple, Union
|
||||
|
||||
import aiohttp
|
||||
from bs4 import BeautifulSoup
|
||||
@ -1251,7 +1251,7 @@ def parse_datetime_interval_string(text):
|
||||
result_text.pop()
|
||||
if len(result_text) > 0 and result_text[-1].lower() in TIME_WORDS:
|
||||
result_text.pop()
|
||||
result_text = escape_html_chars(
|
||||
result_text = clean_html_string(
|
||||
' '.join(result_text)
|
||||
)
|
||||
parsers = list(
|
||||
@ -1330,6 +1330,22 @@ MONTH_NAMES_ITA[10] = "ottobre"
|
||||
MONTH_NAMES_ITA[11] = "novembre"
|
||||
MONTH_NAMES_ITA[12] = "dicembre"
|
||||
|
||||
# Tags that `clean_html_string` leaves untouched when they are well-formed and
# that `remove_html_tags` strips from a text.
allowed_html_tags = ['b', 'strong',
                     'i', 'em',
                     'u', 'ins',
                     's', 'strike', 'del',
                     'span', 'tg-spoiler',
                     'a',
                     'code', 'pre']

# Symbols to be escaped, mapped to their HTML entity.
# Insertion order matters: `&` must be replaced first, otherwise the
# ampersands introduced by the following replacements would be escaped again.
HTML_SYMBOLS = collections.OrderedDict()
HTML_SYMBOLS["&"] = "&amp;"
HTML_SYMBOLS["<"] = "&lt;"
HTML_SYMBOLS[">"] = "&gt;"
HTML_SYMBOLS["\""] = "&quot;"

# Match a numeric character code (`&#` + 2 or 3 digits + `;`) whose leading
# ampersand has already been escaped to `&amp;`; the `code` group holds the
# part following the ampersand, so that `&\g<code>` restores the original.
html_numeric_code_regex = re.compile(r'&amp;(?P<code>#\d{2,3};)')
|
||||
|
||||
|
||||
def beautytd(td):
|
||||
"""Format properly timedeltas."""
|
||||
@ -1410,67 +1426,56 @@ def beautydt(dt):
|
||||
return result
|
||||
|
||||
|
||||
HTML_SYMBOLS = MyOD()
|
||||
HTML_SYMBOLS["&"] = "&"
|
||||
HTML_SYMBOLS["<"] = "<"
|
||||
HTML_SYMBOLS[">"] = ">"
|
||||
HTML_SYMBOLS["\""] = """
|
||||
HTML_SYMBOLS["<b>"] = "<b>"
|
||||
HTML_SYMBOLS["</b>"] = "</b>"
|
||||
HTML_SYMBOLS["<i>"] = "<i>"
|
||||
HTML_SYMBOLS["</i>"] = "</i>"
|
||||
HTML_SYMBOLS["<code>"] = "<code>"
|
||||
HTML_SYMBOLS["</code>"] = "</code>"
|
||||
HTML_SYMBOLS["<pre>"] = "<pre>"
|
||||
HTML_SYMBOLS["</pre>"] = "</pre>"
|
||||
HTML_SYMBOLS["<a href=""] = "<a href=\""
|
||||
HTML_SYMBOLS["">"] = "\">"
|
||||
HTML_SYMBOLS["</a>"] = "</a>"
|
||||
def _escape_html_symbols(text: str) -> str:
    """Escape `HTML_SYMBOLS` in `text`, keeping HTML numeric codes usable."""
    for symbol, escaped_symbol in HTML_SYMBOLS.items():
        text = text.replace(symbol, escaped_symbol)
    # Give numeric character codes their leading `&` back.
    return re.sub(html_numeric_code_regex, r'&\g<code>', text)


def clean_html_string(text: str) -> str:
    """Escape HTML symbols, unless part of a valid tag or numeric code character.

    Look for well-formed pairs of the tags listed in `allowed_html_tags`.
    The opening and closing tag of each pair are preserved as they are, the
    text between them is cleaned recursively, and everything outside valid
    pairs gets the symbols in `HTML_SYMBOLS` escaped, except for the `&` of
    HTML numeric code characters (as matched by `html_numeric_code_regex`).

    Accepted attributes: <a> must have an href attribute, <span> must have
    class="tg-spoiler", <code> may have a class attribute; every other tag
    must have no attribute.
    """
    tag_patterns = []
    for tag in allowed_html_tags:
        if tag == 'a':  # <a> must have href attribute
            attribute = r' href="[^"]*"'
        elif tag == 'span':  # <span> must have class="tg-spoiler"
            attribute = r' class="tg-spoiler"'
        elif tag == 'code':  # <code> may declare a language in its class
            attribute = r'( class="[^"]*")?'
        else:
            attribute = ''
        # Attribute values are matched with `[^"]*` so that an opening tag
        # cannot extend past its own closing quote and swallow later markup.
        # DOTALL lets a tag pair enclose text spanning several lines.
        tag_patterns.append(
            re.compile(
                rf'(?P<opening><{re.escape(tag)}{attribute}>)'
                rf'(?P<body>.*?)'
                rf'(?P<close></{re.escape(tag)}>)',
                re.DOTALL
            )
        )
    pieces = []
    position = 0
    # Walk through the text one valid tag pair at a time. Iterating (instead
    # of recursing on the remainder) keeps recursion depth bound to the
    # nesting level of tags rather than to their number.
    while True:
        first_match = None
        for pattern in tag_patterns:
            match = pattern.search(text, position)
            if match and (first_match is None
                          or match.start() < first_match.start()):
                first_match = match
        if first_match is None:
            break
        # No valid pair starts before `first_match`: plain escaping is enough.
        pieces.append(_escape_html_symbols(text[position:first_match.start()]))
        pieces.append(first_match.group('opening'))
        pieces.append(clean_html_string(first_match.group('body')))
        pieces.append(first_match.group('close'))
        position = first_match.end()
    pieces.append(_escape_html_symbols(text[position:]))
    return ''.join(pieces)
|
||||
|
||||
|
||||
def escape_html_chars(text):
    """Deprecated: use `clean_html_string` instead.

    Log an error reporting the deprecation, then return the result of
    `clean_html_string(text)`.
    """
    logging.error("`escape_html_chars` function deprecated, use `clean_html_string` instead.")
    return clean_html_string(text)
|
||||
|
||||
|
||||
def remove_html_tags(text):
    """Remove HTML tags from `text`.

    Opening and closing tags listed in `allowed_html_tags` are deleted, with
    or without an `href` or `class` attribute; the text they enclose is kept.
    """
    for tag in allowed_html_tags:
        # `[^"]*` keeps the attribute value within its own quotes: a greedy
        # `.*` would delete everything up to the last `">` found on the line.
        text = re.sub(
            rf'</?{re.escape(tag)}( (href|class)="[^"]*")?>',
            '',
            text
        )
    return text
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user