-
Notifications
You must be signed in to change notification settings - Fork 4.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4475 from vigneshp826/master
Improve Slack Sanitization
- Loading branch information
Showing
2 changed files
with
19 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,11 +12,14 @@ This project adheres to `Semantic Versioning`_ starting with version 1.0. | |
|
||
Added | ||
----- | ||
|
||
- added ``DynamoTrackerStore`` to support persistence of agents running on AWS | ||
- added docstrings for ``TrackerStore`` classes | ||
- added buttons and images to mattermost. | ||
- `CRFEntityExtractor` updated to accept arbitrary token-level features like word vectors (issues/4214) | ||
- `SpacyFeaturizer` updated to add `ner_features` for `CRFEntityExtractor` | ||
- Sanitizing incoming messages from Slack to remove Slack formatting like <mailto:user@example.com|user@example.com> | |
or <http://url.com|url.com> and substitute it with the original content | |
|
||
Changed | ||
------- | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -180,7 +180,6 @@ def _is_user_message(slack_event): | |
@staticmethod | ||
def _sanitize_user_message(text, uids_to_remove): | ||
"""Remove superfluous/wrong/problematic tokens from a message. | ||
Probably a good starting point for pre-formatting of user-provided text | ||
to make NLU's life easier in case they go funky to the power of extreme | ||
|
@@ -193,18 +192,32 @@ def _sanitize_user_message(text, uids_to_remove): | |
Returns: | ||
str: parsed and cleaned version of the input text | ||
""" | ||
|
||
for uid_to_remove in uids_to_remove: | ||
# heuristic to format majority cases OK | ||
# can be adjusted to taste later if needed, | ||
# but is a good first approximation | ||
for regex, replacement in [ | ||
(r"<@{}>\s".format(uid_to_remove), ""), | ||
(r"\s<@{}>".format(uid_to_remove), ""), | ||
# a bit arbitrary but probably OK | ||
( | ||
r"\s<@{}>".format(uid_to_remove), | ||
"", | ||
), # a bit arbitrary but probably OK | ||
(r"<@{}>".format(uid_to_remove), " "), | ||
]: | ||
text = re.sub(regex, replacement, text) | ||
|
||
"""Find mailto or http links like <mailto:[email protected]|[email protected]> or '<http://url.com|url.com>in text and substitute it with original content | ||
""" | ||
|
||
pattern = r"\<(mailto:|(http|https):\/\/).*\|.*\>" | ||
match = re.search(pattern, text) | ||
|
||
if match: | ||
regex = match.group(0) | ||
replacement = regex.split("|")[1] | ||
replacement = replacement.replace(">", "") | ||
text = text.replace(regex, replacement) | ||
return text.strip() | ||
|
||
@staticmethod | ||
|