Skip to content

Commit

Permalink
Merge pull request #1434 from swirlai/DS-2944
Browse files Browse the repository at this point in the history
Ds 2944
  • Loading branch information
dnicodemus authored Sep 23, 2024
2 parents 42beec0 + 77c11fd commit 4a1fa2d
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 19 deletions.
2 changes: 1 addition & 1 deletion swirl/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def getSearchPreQueryProcessorsDefault():
return []

def getSearchPostResultProcessorsDefault():
return ["DedupeByFieldPostResultProcessor","CosineRelevancyPostResultProcessor", "RedactPIIPostResultProcessor"]
return ["DedupeByFieldPostResultProcessor","CosineRelevancyPostResultProcessor"]

class Search(models.Model):
id = models.BigAutoField(primary_key=True)
Expand Down
36 changes: 18 additions & 18 deletions swirl/processors/remove_pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ def remove_pii(text: str, query_string=None, redact=False) -> str:
By default, Presidio redacts entities, replacing each one with <entity-type>.
The Presidio "redact" option removes the PII entirely.
In SWIRL, remove means "remove the PII" and "redact" means "replace it with <entity-type>".
:param text: The input string (either query or result) to clean.
:return: The text with PII removed.
"""

text = remove_tags(text)
pii_entities = analyzer.analyze(text=text, language='en')
untagged_text = remove_tags(text)
pii_entities = analyzer.analyze(text=untagged_text, language='en')

if not pii_entities:
return text
Expand All @@ -55,11 +55,11 @@ def remove_pii(text: str, query_string=None, redact=False) -> str:
operators = {"DEFAULT": OperatorConfig("replace")}

anonymized_result = anonymizer.anonymize(
text=text,
analyzer_results=pii_entities,
text=untagged_text,
analyzer_results=pii_entities,
operators=operators
)

anonymized_text = anonymized_result.text

if redact:
Expand All @@ -77,17 +77,17 @@ class RemovePIIQueryProcessor(QueryProcessor):
"""
A SWIRL metasearch query processor that removes PII from search queries.
"""

type = 'RemovePIIQueryProcessor'

def process(self) -> str:
    """
    Strip PII from this processor's query string.

    :return: The value produced by ``remove_pii`` for ``self.query_string``.
    """
    # Delegate to remove_pii, leaving its optional arguments at their defaults.
    return remove_pii(self.query_string)

#############################################
Expand All @@ -97,19 +97,19 @@ class RedactPIIResultProcessor(ResultProcessor):
A SWIRL result processor that removes PII from the search results.
Meant to be run after CosineResultProcessor.
"""

type = "RemovePIIResultProcessor"

def process(self) -> int:
"""
:return: The number of modified results.
"""
logger.debug(f"Processing {len(self.results)} results for PII removal.")

modified = 0
for item in self.results:
pii_modified = False

# Remove PII from 'title' and 'body' fields of each result
if 'title' in item:
cleaned_title = redact_pii(item['title'], self.query_string)
Expand All @@ -131,14 +131,14 @@ def process(self) -> int:
if cleaned_payload != item['payload'][key]:
item['payload'][key] = cleaned_payload
pii_modified = True

if pii_modified:
modified += 1

self.processed_results = self.results
self.modified = modified
logger.debug(f"PII removal complete. {self.modified} results modified.")

return self.modified

#############################################
Expand All @@ -147,16 +147,16 @@ class RedactPIIPostResultProcessor(PostResultProcessor):
"""
A SWIRL result processor that removes PII from all results.
"""

type = "RemovePIIPostResultProcessor"

def process(self) -> int:
"""
:return: The number of modified results.
"""

modified = 0

for result in self.results:
for item in result.json_results:
pii_modified = False
Expand All @@ -182,5 +182,5 @@ def process(self) -> int:
modified += 1
result.save()

self.results_updated = modified
self.results_updated = modified
return self.results_updated

0 comments on commit 4a1fa2d

Please sign in to comment.