Skip to content

Commit

Permalink
Merge pull request #4475 from vigneshp826/master
Browse files Browse the repository at this point in the history
Improve Slack Sanitization
  • Loading branch information
erohmensing authored Sep 25, 2019
2 parents 003bed9 + 5becabb commit 19fa8a3
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 3 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@ This project adheres to `Semantic Versioning`_ starting with version 1.0.

Added
-----

- added ``DynamoTrackerStore`` to support persistence of agents running on AWS
- added docstrings for ``TrackerStore`` classes
- added buttons and images to mattermost.
- `CRFEntityExtractor` updated to accept arbitrary token-level features like word vectors (issues/4214)
- `SpacyFeaturizer` updated to add `ner_features` for `CRFEntityExtractor`
- Sanitizing incoming messages from Slack to remove Slack formatting like <mailto:user@example.com|user@example.com>
or <http://url.com|url.com> and substitute it with the original content

Changed
-------
Expand Down
19 changes: 16 additions & 3 deletions rasa/core/channels/slack.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@ def _is_user_message(slack_event):
@staticmethod
def _sanitize_user_message(text, uids_to_remove):
    """Remove superfluous/wrong/problematic tokens from a message.

    Two kinds of Slack markup are cleaned up:

    * user mentions of the form ``<@UID>`` for every UID listed in
      ``uids_to_remove``;
    * labelled links of the form ``<mailto:address|label>`` or
      ``<http(s)://url|label>``, which are replaced by their ``label``
      part (the text after the ``|``).

    Args:
        text: message text to clean
        uids_to_remove: iterable of user IDs whose ``<@UID>`` mentions
            should be removed from ``text``

    Returns:
        str: parsed and cleaned version of the input text, with leading
        and trailing whitespace stripped
    """
    for uid_to_remove in uids_to_remove:
        # Escape the ID so it is always matched literally, even if it
        # happens to contain regex metacharacters.
        mention = r"<@" + re.escape(uid_to_remove) + r">"

        # heuristic to format majority cases OK
        # can be adjusted to taste later if needed,
        # but is a good first approximation
        for regex, replacement in [
            # mention followed by whitespace: drop both
            (mention + r"\s", ""),
            # mention preceded by whitespace: drop both
            # (a bit arbitrary but probably OK)
            (r"\s" + mention, ""),
            # mention glued between other characters: keep a separator
            (mention, " "),
        ]:
            text = re.sub(regex, replacement, text)

    # Replace mailto/http(s) links such as
    # <mailto:user@example.com|user@example.com> or <http://url.com|url.com>
    # with their label. The character classes stop at the first "|" / ">"
    # so that each link is matched on its own; a greedy ".*" would span
    # from the first "<" to the last ">" when a message holds several links.
    link_pattern = r"<(?:mailto:|https?://)[^|>]*\|([^>]*)>"
    text = re.sub(link_pattern, r"\1", text)

    return text.strip()

@staticmethod
Expand Down

0 comments on commit 19fa8a3

Please sign in to comment.