Skip to content

Commit

Permalink
text generator and test (#295)
Browse files Browse the repository at this point in the history
* text generator and test

* fixed style and 256 to 255 error

* fixed eof

* new eof line

* one line eof

* updated num_rows test with multiple row counts

* removed whitespace in eof

* moved text_generator test into distinct_gen test folder
  • Loading branch information
drahc1R authored Jul 12, 2023
1 parent 60ed0ae commit 7ea0f0b
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 0 deletions.
76 changes: 76 additions & 0 deletions synthetic_data/distinct_generators/text_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import numpy as np
import string
from numpy.random import Generator
from typing import List, Optional


def random_string(
    rng: Generator,
    chars: Optional[List[str]] = None,
    num_rows: int = 1,
    str_len_min: int = 1,
    str_len_max: int = 256,
) -> np.array:
    """
    Randomly generates an array of strings with length between a min and max value

    Each string's length is drawn from the closed interval
    [str_len_min, str_len_max], so both bounds can actually occur and
    str_len_min == str_len_max yields fixed-length strings.

    :param rng: the np rng object used to generate random values
    :type rng: numpy Generator
    :param chars: a list of values that are allowed in a string or None
        (None selects ASCII letters, digits, a space and punctuation)
    :type chars: List[str], None
    :param num_rows: the number of rows in np array generated
    :type num_rows: int, optional
    :param str_len_min: the minimum length a string can be
    :type str_len_min: int, optional
    :param str_len_max: the maximum length a string can be (inclusive)
    :type str_len_max: int, optional
    :raises ValueError: propagated from rng.integers when
        str_len_min is greater than str_len_max
    :return: numpy array of strings
    """
    if chars is None:
        chars = list(
            string.ascii_uppercase
            + string.ascii_lowercase
            + string.digits
            + " "
            + string.punctuation
        )

    string_list = []
    for _ in range(num_rows):
        # endpoint=True makes the upper bound inclusive; by default
        # Generator.integers excludes it, so str_len_max was never produced
        # and str_len_min == str_len_max raised a ValueError.
        length = rng.integers(str_len_min, str_len_max, endpoint=True)
        string_entry = "".join(rng.choice(chars, (length,)))
        string_list.append(string_entry)

    # An explicit string dtype keeps the result a string array even when
    # num_rows is 0 (an empty list would otherwise become a float array).
    return np.array(string_list, dtype=str)


def random_text(
    rng: Generator,
    chars: Optional[List[str]] = None,
    num_rows: int = 1,
    str_len_min: int = 256,
    str_len_max: int = 1000,
) -> np.array:
    """
    Randomly generates an array of text with length between a min and max value

    Validates that str_len_min is at least 256 and then delegates the
    generation to random_string with the same arguments.

    :param rng: the np rng object used to generate random values
    :type rng: numpy Generator
    :param chars: a list of values that are allowed in a string or None
    :type chars: List[str], None
    :param num_rows: the number of rows in np array generated
    :type num_rows: int, optional
    :param str_len_min: the minimum length a string can be (must be larger than 255)
    :type str_len_min: int, optional
    :param str_len_max: the maximum length a string can be
    :type str_len_max: int, optional
    :raises ValueError: if str_len_min is smaller than 256
    :return: numpy array of text
    """
    # Guard first so an invalid minimum never consumes values from rng.
    if str_len_min < 256:
        raise ValueError(
            f"str_len_min must be > 255. " f"Value provided: {str_len_min}."
        )

    return random_string(rng, chars, num_rows, str_len_min, str_len_max)
48 changes: 48 additions & 0 deletions tests/distinct_generators/test_text_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import unittest
from unittest import mock
import pandas as pd
import numpy as np
from synthetic_data.distinct_generators.text_generator import random_string, random_text


class TestTextGeneratorFunctions(unittest.TestCase):
    """Unit tests for the random_string and random_text generators."""

    def setUp(self):
        # Seeded generator so each test draws a reproducible sequence.
        self.rng = np.random.default_rng(12345)

    def test_return_type(self):
        """Every element of both generated arrays is a numpy string."""
        strings = random_string(self.rng)
        texts = random_text(self.rng)
        for generated in (strings, texts):
            for element in generated:
                self.assertIsInstance(element, np.str_)

    def test_str_length(self):
        """Generated lengths stay within the requested bounds."""
        strings = random_string(self.rng, str_len_min=1, str_len_max=256)
        texts = random_text(self.rng, str_len_min=256, str_len_max=1000)

        # random_text must reject a minimum length below 256.
        with self.assertRaises(ValueError):
            random_text(self.rng, str_len_min=255)

        string_length = len(strings[0])
        self.assertLessEqual(string_length, 256)
        self.assertGreaterEqual(string_length, 1)

        text_length = len(texts[0])
        self.assertLessEqual(text_length, 1000)
        self.assertGreaterEqual(text_length, 256)

    def test_num_rows(self):
        """The output array holds exactly num_rows entries."""
        for row_count in (1, 5, 10):
            strings = random_string(self.rng, num_rows=row_count)
            texts = random_text(self.rng, num_rows=row_count)
            self.assertEqual(strings.size, row_count)
            self.assertEqual(texts.size, row_count)

    def test_chars(self):
        """Only characters from the supplied list appear in the output."""
        allowed = {"0", "1"}
        strings = random_string(self.rng, chars=["0", "1"])
        texts = random_text(self.rng, chars=["0", "1"])
        for generated in (strings, texts):
            for entry in generated:
                for char in entry:
                    self.assertIn(char, allowed)

0 comments on commit 7ea0f0b

Please sign in to comment.