module TestUtils.BuildLargeTestData
  ( generateGoldenManySubjectsRw
  , generateGoldenManySubjectsCw
  , generateTestDataManySubjects
  , generateTestDataManyEvents
  , largeInputSize
  ) where

import           Data.List   (intercalate, sort)
import           Text.Printf
import           Text.Regex

{-
Replication factor used when building the "large data" tests: the number of
copies made of the base test data
-}
largeInputSize :: Int
largeInputSize = 1000

{-
Generates new test data by taking each subject and replicating it
`largeInputSize` times, with each subject ID suffixed by a number so that the
replicated subject IDs are unique
-}
generateTestDataManySubjects :: String -> String -> IO ()
generateTestDataManySubjects infilepath outfilepath =
  generateTestDataBase infilepath outfilepath replicateSubjects
 where
  -- Rewrites the subject IDs matched by `patientRe`, once per replicate
  replicateSubjects :: [String] -> [String]
  replicateSubjects = generateNewSubjs patientRe

{-
Generates new test data by dropping all subjects except the subject with subject
ID `"a"`, and replicating that subject's events `largeInputSize` number of times

A line is kept only when its third character is `a` and its fourth character is
`"` (presumably lines starting with `["a"` -- confirm against the test data).
Lines that are too short to be inspected, such as blank lines, are dropped
instead of crashing the generator with an out-of-range index.

TODO: this is quite fragile -- if the form of the test data changes then this
routine may have unintended behavior without any warning. We should add an
assertion or think of a better approach altogether -- DP 2022-04-05
-}
generateTestDataManyEvents :: String -> String -> IO ()
generateTestDataManyEvents infilepath outfilepath =
  generateTestDataBase infilepath outfilepath transform
 where
  -- Total replacement for `s !! 2 == 'a' && s !! 3 == '"'`: identical result
  -- for lines of four or more characters, `False` for anything shorter
  checkIfSubjA :: String -> Bool
  checkIfSubjA (_ : _ : 'a' : '"' : _) = True
  checkIfSubjA _                       = False

  -- Keep only subject "a"'s lines, then replicate them
  transform :: [String] -> [String]
  transform = generateNewEvents . filter checkIfSubjA

{-
Writes the golden file that corresponds to the test data produced by
`generateTestDataManySubjects`, for a cohort built with the row-wise version of
the cohort-building application. The file contents are the opening fragment, the
comma-separated replicated row-wise entries, and the closing fragment
-}
generateGoldenManySubjectsRw :: String -> IO ()
generateGoldenManySubjectsRw outfilepath = writeFile outfilepath contents
 where
  contents :: String
  contents = concat
    [ concat goldenManySubjectsStart
    , intercalate "," goldenRwPatientManys
    , concat goldenRwEnd
    ]

{-
Writes the golden file that corresponds to the test data produced by
`generateTestDataManySubjects`, for a cohort built with the column-wise version
of the cohort-building application. The file contents are the opening fragment,
the comma-separated variable arrays, the `ids` separator, the comma-separated
replicated patient entries, and the closing fragment
-}
generateGoldenManySubjectsCw :: String -> IO ()
generateGoldenManySubjectsCw outfilepath = writeFile outfilepath contents
 where
  commaJoin :: [String] -> String
  commaJoin = intercalate ","

  contents :: String
  contents = concat
    [ concat goldenManySubjectsStart
    , commaJoin goldenCwVarManys
    , "],\"ids\":["
    , commaJoin goldenCwPatientManys
    , concat goldenCwEnd
    ]

{-
Shared helper: reads the input file as a list of lines, applies `transform` to
that list, and writes the result to the output file with a newline after every
line
-}
generateTestDataBase :: String -> String -> ([String] -> [String]) -> IO ()
generateTestDataBase infilePath outfilePath transform =
  readLines infilePath >>= writeFile outfilePath . unlines . transform

{-
Produces `largeInputSize` copies of the input entries. Copy number `i` (counting
from 0) has every match of `re` rewritten using the replacement built from the
zero-padded index `i`, which makes the replicated subject IDs unique. All
entries of copy 0 come first, then all entries of copy 1, and so on
-}
generateNewSubjs :: Regex -> [String] -> [String]
generateNewSubjs re entries =
  concat [ updateIds re suffix entries | suffix <- suffixes ]
 where
  lastIndex :: Int
  lastIndex = largeInputSize - 1

  -- One replacement pattern per replicate index
  suffixes :: [String]
  suffixes = [ constructReplacement (formatNum idx) | idx <- [0 .. lastIndex] ]

{-
Applies a regex search-and-replace, with pattern `pat` and the given
`replacement`, to every string in a list
-}
updateIds :: Regex -> String -> [String] -> [String]
updateIds pat replacement = map (\entry -> subRegex pat entry replacement)

{-
Repeats the whole list of events `largeInputSize` times, back to back
-}
generateNewEvents :: [String] -> [String]
generateNewEvents events = concat (replicate largeInputSize events)

{-
The opening fragments of the "many subjects" golden files. The same subject
count, `nValidSubj * largeInputSize`, is spliced in at three places
-}
goldenManySubjectsStart :: [String]
goldenManySubjectsStart =
  [ "{\"example\":[{\"attritionInfo\":[[{\"tag\":\"SubjectHasNoIndex\"},0],[{\"contents\":[1,\"dummy\"],\"tag\":\"ExcludedBy\"},0],[{\"tag\":\"Included\"},"
  , subjectCount
  , "]],\"totalSubjectsProcessed\":"
  , subjectCount
  , ",\"totalUnitsProcessed\":"
  , subjectCount
  , "},{\"contents\":{\"attributes\":[{\"attrs\":{\"getDerivation\":\"\",\"getLongLabel\":\"another label\",\"getPurpose\":{\"getRole\":[\"Outcome\"],\"getTags\":[]},\"getShortLabel\":\"somelabel\"},\"name\":\"myVar1\",\"type\":\"Count\"},{\"attrs\":{\"getDerivation\":\"\",\"getLongLabel\":\"\",\"getPurpose\":{\"getRole\":[],\"getTags\":[]},\"getShortLabel\":\"\"},\"name\":\"myVar2\",\"type\":\"Bool\"}],\"cohortData\":["
  ]
 where
  -- Rendered once and reused for all three count fields
  subjectCount :: String
  subjectCount = show (nValidSubj * largeInputSize)

{-
The row-wise entries, one per subject, that get replicated with rewritten
subject IDs
-}
goldenRwData :: [String]
goldenRwData =
  [ "[[\"a\",[\"2010-07-06\",\"2010-07-07\"]],[5,true]]"
  , "[[\"b\",[\"2010-07-06\",\"2010-07-07\"]],[5,true]]"
  ]

{-
The row-wise entries after replication and subject-ID rewriting, in sorted order
-}
goldenRwPatientManys :: [String]
goldenRwPatientManys = sort (generateNewSubjs patientRe goldenRwData)

{-
The column-wise patient information to be replicated (and where the subject IDs
are changed)
-}
goldenCwPatientEntries :: [String]
goldenCwPatientEntries =
  [ "[\"a\",[\"2010-07-06\",\"2010-07-07\"]]"  -- subject "a"
  , "[\"b\",[\"2010-07-06\",\"2010-07-07\"]]"  -- subject "b"
  ]

{-
The column-wise patient entries after replication and subject-ID rewriting, in
sorted order
-}
goldenCwPatientManys :: [String]
goldenCwPatientManys = sort (generateNewSubjs patientRe goldenCwPatientEntries)

{-
The column-wise variable values to be replicated: one comma-separated string per
variable
-}
goldenCwVarEntries :: [String]
goldenCwVarEntries =
  [ "5,5"
  , "true,true"
  ]

{-
For each entry of `goldenCwVarEntries`, repeats the entry `largeInputSize` times
separated by commas and wraps the result in "[" and "]", so that every element
of the returned list is a string representing a JSON array
-}
goldenCwVarManys :: [String]
goldenCwVarManys =
  [ "[" ++ repeated entry ++ "]" | entry <- goldenCwVarEntries ]
 where
  repeated :: String -> String
  repeated entry = intercalate "," (replicate largeInputSize entry)

{-
The closing fragment of the "many subjects" row-wise golden file
-}
goldenRwEnd :: [String]
goldenRwEnd =
  [ "]},\"tag\":\"RW\"}]}"
  ]

{-
The concluding fragment for the "many subjects" column-wise golden file
-}
goldenCwEnd :: [String]
goldenCwEnd =
  [ "]},\"tag\":\"CW\"}]}"
  ]

{-
How many subjects have at least one valid entry in
hasklepias-main/exampleApp-test/test/testData.jsonl (subject "c" has no valid
entries, so it is not counted)

TODO: add a test to ensure that the test data filepath doesn't change and that
the number of valid subjects also doesn't change
-}
nValidSubj :: Int
nValidSubj = 2

{-
Renders a number as a decimal string, left-padded with zeros to `numWidth`
characters
-}
formatNum :: Int -> String
formatNum n = printf paddedFormat n
 where
  -- Format string of the shape "%0<width>d"
  paddedFormat :: String
  paddedFormat = "%0" ++ show numWidth ++ "d"

{-
The number of characters in the string representation of `largeInputSize - 1`,
the largest replicate index
-}
numWidth :: Int
numWidth = length (show largestIndex)
 where
  largestIndex :: Int
  largestIndex = largeInputSize - 1

{-
Builds the replacement pattern: a double-quoted string made of a reference to
the first capture group, a hyphen, and `label`
-}
constructReplacement :: String -> String
constructReplacement label = concat ["\"\\1-", label, "\""]

{-
Reads a file and splits its contents into a list of lines
-}
readLines :: String -> IO [String]
readLines path = lines <$> readFile path

{-
Regular expression matching a double-quoted subject ID `a`, `b` or `c`, as used
in the hasklepias-main/exampleApp-test/test/testData.jsonl data; the bare ID is
captured as group 1

TODO: add a test to ensure that this regex captures all of the subjects IDs
-}
patientRe :: Regex
patientRe = mkRegex "\"(a|b|c)\""