-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* text generator and test * fixed style and 256 to 255 error * fixed eof * new eof line * one line eof * updated num_rows test with multiple row counts * removed whitespace in eof * moved text_generator test into distinct_gen test folder
- Loading branch information
Showing
2 changed files
with
124 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import numpy as np | ||
import string | ||
from numpy.random import Generator | ||
from typing import List, Optional | ||
|
||
|
||
def random_string(
    rng: Generator,
    chars: Optional[List[str]] = None,
    num_rows: int = 1,
    str_len_min: int = 1,
    str_len_max: int = 256,
) -> np.ndarray:
    """
    Randomly generates an array of strings with length between a min and max value

    :param rng: the np rng object used to generate random values
    :type rng: numpy Generator
    :param chars: a list of values that are allowed in a string or None
        (None selects ASCII letters, digits, the space character and
        punctuation)
    :type chars: List[str], None
    :param num_rows: the number of rows in np array generated
    :type num_rows: int, optional
    :param str_len_min: the minimum length a string can be (inclusive)
    :type str_len_min: int, optional
    :param str_len_max: the maximum length a string can be (inclusive)
    :type str_len_max: int, optional
    :return: numpy array of strings
    """
    if chars is None:
        chars = list(
            string.ascii_uppercase
            + string.ascii_lowercase
            + string.digits
            + " "
            + string.punctuation
        )

    string_list = []
    for _ in range(num_rows):
        # endpoint=True makes str_len_max an attainable length and allows
        # str_len_min == str_len_max; the default half-open interval would
        # silently cap lengths at str_len_max - 1.
        length = rng.integers(str_len_min, str_len_max, endpoint=True)
        string_list.append("".join(rng.choice(chars, (length,))))

    # An explicit string dtype keeps the zero-row result a string array
    # instead of numpy's default float64 for an empty list.
    return np.array(string_list, dtype=str)
|
||
|
||
def random_text(
    rng: Generator,
    chars: Optional[List[str]] = None,
    num_rows: int = 1,
    str_len_min: int = 256,
    str_len_max: int = 1000,
) -> np.ndarray:
    """
    Randomly generates an array of text with length between a min and max value

    Validates the minimum length and then delegates to random_string with
    the same arguments.

    :param rng: the np rng object used to generate random values
    :type rng: numpy Generator
    :param chars: a list of values that are allowed in a string or None
    :type chars: List[str], None
    :param num_rows: the number of rows in np array generated
    :type num_rows: int, optional
    :param str_len_min: the minimum length a string can be (must be larger than 255)
    :type str_len_min: int, optional
    :param str_len_max: the maximum length a string can be
    :type str_len_max: int, optional
    :return: numpy array of text
    :raises ValueError: if str_len_min is smaller than 256
    """
    # Reject anything shorter than 256 characters up front.
    if str_len_min < 256:
        raise ValueError(
            f"str_len_min must be > 255. Value provided: {str_len_min}."
        )

    return random_string(rng, chars, num_rows, str_len_min, str_len_max)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import unittest | ||
from unittest import mock | ||
import pandas as pd | ||
import numpy as np | ||
from synthetic_data.distinct_generators.text_generator import random_string, random_text | ||
|
||
|
||
class TestTextGeneratorFunctions(unittest.TestCase):
    """Unit tests for the random_string and random_text generators."""

    def setUp(self):
        # Fixed seed so each test starts from the same random stream.
        self.rng = np.random.default_rng(12345)

    def test_return_type(self):
        """Every generated element is a numpy string scalar."""
        strings = random_string(self.rng)
        texts = random_text(self.rng)
        for generated in (strings, texts):
            for element in generated:
                self.assertIsInstance(element, np.str_)

    def test_str_length(self):
        """Generated lengths stay within the requested bounds."""
        strings = random_string(self.rng, str_len_min=1, str_len_max=256)
        texts = random_text(self.rng, str_len_min=256, str_len_max=1000)

        # random_text refuses a minimum length below 256.
        with self.assertRaises(ValueError):
            random_text(self.rng, str_len_min=255)

        first_string_len = len(strings[0])
        self.assertLessEqual(first_string_len, 256)
        self.assertGreaterEqual(first_string_len, 1)

        first_text_len = len(texts[0])
        self.assertLessEqual(first_text_len, 1000)
        self.assertGreaterEqual(first_text_len, 256)

    def test_num_rows(self):
        """The output holds exactly num_rows entries."""
        for expected_rows in (1, 5, 10):
            strings = random_string(self.rng, num_rows=expected_rows)
            texts = random_text(self.rng, num_rows=expected_rows)
            self.assertEqual(strings.size, expected_rows)
            self.assertEqual(texts.size, expected_rows)

    def test_chars(self):
        """Only characters from the supplied alphabet appear in the output."""
        allowed = {"0", "1"}
        strings = random_string(self.rng, chars=["0", "1"])
        texts = random_text(self.rng, chars=["0", "1"])
        for generated in (strings, texts):
            for entry in generated:
                for char in entry:
                    self.assertIn(char, allowed)