module PostgresqlTypes.Tsvector
  ( Tsvector,

    -- * Accessors
    toLexemeList,

    -- * Constructors
    refineFromLexemeList,
    normalizeFromLexemeList,

    -- * Weight
    Weight (..),
  )
where

import qualified Data.Attoparsec.Text as Attoparsec
import qualified Data.ByteString as ByteString
import qualified Data.List as List
import qualified Data.Map.Strict as Map
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text.Encoding
import qualified Data.Vector as Vector
import PostgresqlTypes.Algebra
import PostgresqlTypes.Prelude
import PostgresqlTypes.Via
import qualified PtrPeeker
import qualified PtrPoker.Write as Write
import qualified Test.QuickCheck as QuickCheck
import qualified TextBuilder

-- | Weight of a tsvector lexeme position.
data Weight = AWeight | BWeight | CWeight | DWeight
  deriving stock (Weight -> Weight -> Bool
(Weight -> Weight -> Bool)
-> (Weight -> Weight -> Bool) -> Eq Weight
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Weight -> Weight -> Bool
== :: Weight -> Weight -> Bool
$c/= :: Weight -> Weight -> Bool
/= :: Weight -> Weight -> Bool
Eq, Eq Weight
Eq Weight =>
(Weight -> Weight -> Ordering)
-> (Weight -> Weight -> Bool)
-> (Weight -> Weight -> Bool)
-> (Weight -> Weight -> Bool)
-> (Weight -> Weight -> Bool)
-> (Weight -> Weight -> Weight)
-> (Weight -> Weight -> Weight)
-> Ord Weight
Weight -> Weight -> Bool
Weight -> Weight -> Ordering
Weight -> Weight -> Weight
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Weight -> Weight -> Ordering
compare :: Weight -> Weight -> Ordering
$c< :: Weight -> Weight -> Bool
< :: Weight -> Weight -> Bool
$c<= :: Weight -> Weight -> Bool
<= :: Weight -> Weight -> Bool
$c> :: Weight -> Weight -> Bool
> :: Weight -> Weight -> Bool
$c>= :: Weight -> Weight -> Bool
>= :: Weight -> Weight -> Bool
$cmax :: Weight -> Weight -> Weight
max :: Weight -> Weight -> Weight
$cmin :: Weight -> Weight -> Weight
min :: Weight -> Weight -> Weight
Ord, Int -> Weight -> ShowS
[Weight] -> ShowS
Weight -> String
(Int -> Weight -> ShowS)
-> (Weight -> String) -> ([Weight] -> ShowS) -> Show Weight
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Weight -> ShowS
showsPrec :: Int -> Weight -> ShowS
$cshow :: Weight -> String
show :: Weight -> String
$cshowList :: [Weight] -> ShowS
showList :: [Weight] -> ShowS
Show, ReadPrec [Weight]
ReadPrec Weight
Int -> ReadS Weight
ReadS [Weight]
(Int -> ReadS Weight)
-> ReadS [Weight]
-> ReadPrec Weight
-> ReadPrec [Weight]
-> Read Weight
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
$creadsPrec :: Int -> ReadS Weight
readsPrec :: Int -> ReadS Weight
$creadList :: ReadS [Weight]
readList :: ReadS [Weight]
$creadPrec :: ReadPrec Weight
readPrec :: ReadPrec Weight
$creadListPrec :: ReadPrec [Weight]
readListPrec :: ReadPrec [Weight]
Read, Int -> Weight
Weight -> Int
Weight -> [Weight]
Weight -> Weight
Weight -> Weight -> [Weight]
Weight -> Weight -> Weight -> [Weight]
(Weight -> Weight)
-> (Weight -> Weight)
-> (Int -> Weight)
-> (Weight -> Int)
-> (Weight -> [Weight])
-> (Weight -> Weight -> [Weight])
-> (Weight -> Weight -> [Weight])
-> (Weight -> Weight -> Weight -> [Weight])
-> Enum Weight
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
$csucc :: Weight -> Weight
succ :: Weight -> Weight
$cpred :: Weight -> Weight
pred :: Weight -> Weight
$ctoEnum :: Int -> Weight
toEnum :: Int -> Weight
$cfromEnum :: Weight -> Int
fromEnum :: Weight -> Int
$cenumFrom :: Weight -> [Weight]
enumFrom :: Weight -> [Weight]
$cenumFromThen :: Weight -> Weight -> [Weight]
enumFromThen :: Weight -> Weight -> [Weight]
$cenumFromTo :: Weight -> Weight -> [Weight]
enumFromTo :: Weight -> Weight -> [Weight]
$cenumFromThenTo :: Weight -> Weight -> Weight -> [Weight]
enumFromThenTo :: Weight -> Weight -> Weight -> [Weight]
Enum, Weight
Weight -> Weight -> Bounded Weight
forall a. a -> a -> Bounded a
$cminBound :: Weight
minBound :: Weight
$cmaxBound :: Weight
maxBound :: Weight
Bounded)

instance Arbitrary Weight where
  arbitrary :: Gen Weight
arbitrary = [Weight] -> Gen Weight
forall a. HasCallStack => [a] -> Gen a
QuickCheck.elements [Weight
AWeight, Weight
BWeight, Weight
CWeight, Weight
DWeight]

instance Hashable Weight where
  hashWithSalt :: Int -> Weight -> Int
hashWithSalt Int
salt = Int -> Int -> Int
forall a. Hashable a => Int -> a -> Int
hashWithSalt Int
salt (Int -> Int) -> (Weight -> Int) -> Weight -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
forall {k} (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Weight -> Int
forall a. Enum a => a -> Int
fromEnum

-- | PostgreSQL @tsvector@ type. Full-text search document representation.
--
-- A tsvector is a sorted list of distinct lexemes with optional position and weight information.
-- Lexemes are sorted alphabetically and deduplicated, matching PostgreSQL's canonical representation.
--
-- [PostgreSQL docs](https://www.postgresql.org/docs/18/datatype-textsearch.html).
data Tsvector = Tsvector (Vector (Text, Vector (Word16, Weight)))
  deriving stock (Tsvector -> Tsvector -> Bool
(Tsvector -> Tsvector -> Bool)
-> (Tsvector -> Tsvector -> Bool) -> Eq Tsvector
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Tsvector -> Tsvector -> Bool
== :: Tsvector -> Tsvector -> Bool
$c/= :: Tsvector -> Tsvector -> Bool
/= :: Tsvector -> Tsvector -> Bool
Eq, Eq Tsvector
Eq Tsvector =>
(Tsvector -> Tsvector -> Ordering)
-> (Tsvector -> Tsvector -> Bool)
-> (Tsvector -> Tsvector -> Bool)
-> (Tsvector -> Tsvector -> Bool)
-> (Tsvector -> Tsvector -> Bool)
-> (Tsvector -> Tsvector -> Tsvector)
-> (Tsvector -> Tsvector -> Tsvector)
-> Ord Tsvector
Tsvector -> Tsvector -> Bool
Tsvector -> Tsvector -> Ordering
Tsvector -> Tsvector -> Tsvector
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Tsvector -> Tsvector -> Ordering
compare :: Tsvector -> Tsvector -> Ordering
$c< :: Tsvector -> Tsvector -> Bool
< :: Tsvector -> Tsvector -> Bool
$c<= :: Tsvector -> Tsvector -> Bool
<= :: Tsvector -> Tsvector -> Bool
$c> :: Tsvector -> Tsvector -> Bool
> :: Tsvector -> Tsvector -> Bool
$c>= :: Tsvector -> Tsvector -> Bool
>= :: Tsvector -> Tsvector -> Bool
$cmax :: Tsvector -> Tsvector -> Tsvector
max :: Tsvector -> Tsvector -> Tsvector
$cmin :: Tsvector -> Tsvector -> Tsvector
min :: Tsvector -> Tsvector -> Tsvector
Ord)
  deriving (Int -> Tsvector -> ShowS
[Tsvector] -> ShowS
Tsvector -> String
(Int -> Tsvector -> ShowS)
-> (Tsvector -> String) -> ([Tsvector] -> ShowS) -> Show Tsvector
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Tsvector -> ShowS
showsPrec :: Int -> Tsvector -> ShowS
$cshow :: Tsvector -> String
show :: Tsvector -> String
$cshowList :: [Tsvector] -> ShowS
showList :: [Tsvector] -> ShowS
Show, ReadPrec [Tsvector]
ReadPrec Tsvector
Int -> ReadS Tsvector
ReadS [Tsvector]
(Int -> ReadS Tsvector)
-> ReadS [Tsvector]
-> ReadPrec Tsvector
-> ReadPrec [Tsvector]
-> Read Tsvector
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
$creadsPrec :: Int -> ReadS Tsvector
readsPrec :: Int -> ReadS Tsvector
$creadList :: ReadS [Tsvector]
readList :: ReadS [Tsvector]
$creadPrec :: ReadPrec Tsvector
readPrec :: ReadPrec Tsvector
$creadListPrec :: ReadPrec [Tsvector]
readListPrec :: ReadPrec [Tsvector]
Read, String -> Tsvector
(String -> Tsvector) -> IsString Tsvector
forall a. (String -> a) -> IsString a
$cfromString :: String -> Tsvector
fromString :: String -> Tsvector
IsString) via (ViaIsScalar Tsvector)

instance Hashable Tsvector where
  hashWithSalt :: Int -> Tsvector -> Int
hashWithSalt Int
salt (Tsvector Vector (Text, Vector (Word16, Weight))
lexemes) =
    (Int -> (Text, Vector (Word16, Weight)) -> Int)
-> Int -> Vector (Text, Vector (Word16, Weight)) -> Int
forall a b. (a -> b -> a) -> a -> Vector b -> a
Vector.foldl' (\Int
s (Text
t, Vector (Word16, Weight)
ps) -> (Int -> (Word16, Weight) -> Int)
-> Int -> Vector (Word16, Weight) -> Int
forall a b. (a -> b -> a) -> a -> Vector b -> a
Vector.foldl' (\Int
s' (Word16
p, Weight
w) -> Int
s' Int -> Word16 -> Int
forall a. Hashable a => Int -> a -> Int
`hashWithSalt` Word16
p Int -> Int -> Int
forall a. Hashable a => Int -> a -> Int
`hashWithSalt` Weight -> Int
forall a. Enum a => a -> Int
fromEnum Weight
w) (Int
s Int -> Text -> Int
forall a. Hashable a => Int -> a -> Int
`hashWithSalt` Text
t) Vector (Word16, Weight)
ps) Int
salt Vector (Text, Vector (Word16, Weight))
lexemes

instance Arbitrary Tsvector where
  arbitrary :: Gen Tsvector
arbitrary = do
    Int
size <- Gen Int
QuickCheck.getSize
    Int
numLexemes <- (Int, Int) -> Gen Int
forall a. Random a => (a, a) -> Gen a
QuickCheck.choose (Int
0, Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
0 Int
size)
    Map Text [(Word16, Weight)]
lexemeMap <-
      [(Text, [(Word16, Weight)])] -> Map Text [(Word16, Weight)]
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Text, [(Word16, Weight)])] -> Map Text [(Word16, Weight)])
-> Gen [(Text, [(Word16, Weight)])]
-> Gen (Map Text [(Word16, Weight)])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int
-> Gen (Text, [(Word16, Weight)])
-> Gen [(Text, [(Word16, Weight)])]
forall a. Int -> Gen a -> Gen [a]
QuickCheck.vectorOf Int
numLexemes do
        -- Generate a non-empty lexeme token without NUL characters
        Text
token <-
          String -> Text
Text.pack
            (String -> Text) -> Gen String -> Gen Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Gen Char -> Gen String
forall a. Gen a -> Gen [a]
QuickCheck.listOf1
              (Gen Char -> (Char -> Bool) -> Gen Char
forall a. Gen a -> (a -> Bool) -> Gen a
QuickCheck.suchThat Gen Char
forall a. Arbitrary a => Gen a
arbitrary (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\NUL'))
        Int
numPositions <- (Int, Int) -> Gen Int
forall a. Random a => (a, a) -> Gen a
QuickCheck.choose (Int
0, Int
3)
        [(Word16, Weight)]
positions <-
          [(Word16, Weight)] -> [(Word16, Weight)]
sortAndDedupPositions ([(Word16, Weight)] -> [(Word16, Weight)])
-> Gen [(Word16, Weight)] -> Gen [(Word16, Weight)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int -> Gen (Word16, Weight) -> Gen [(Word16, Weight)]
forall a. Int -> Gen a -> Gen [a]
QuickCheck.vectorOf Int
numPositions do
            Word16
pos <- (Word16, Word16) -> Gen Word16
forall a. Random a => (a, a) -> Gen a
QuickCheck.choose (Word16
1, Word16
16383)
            Weight
weight <- Gen Weight
forall a. Arbitrary a => Gen a
arbitrary
            pure (Word16
pos, Weight
weight)
        pure (Text
token, [(Word16, Weight)]
positions)
    -- Sort by lexeme (Map.toAscList) and deduplicate (Map guarantees unique keys)
    let sorted :: [(Text, [(Word16, Weight)])]
sorted = Map Text [(Word16, Weight)] -> [(Text, [(Word16, Weight)])]
forall k a. Map k a -> [(k, a)]
Map.toAscList Map Text [(Word16, Weight)]
lexemeMap
    Tsvector -> Gen Tsvector
forall a. a -> Gen a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Vector (Text, Vector (Word16, Weight)) -> Tsvector
Tsvector ([(Text, Vector (Word16, Weight))]
-> Vector (Text, Vector (Word16, Weight))
forall a. [a] -> Vector a
Vector.fromList (((Text, [(Word16, Weight)]) -> (Text, Vector (Word16, Weight)))
-> [(Text, [(Word16, Weight)])]
-> [(Text, Vector (Word16, Weight))]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, [(Word16, Weight)]
ps) -> (Text
t, [(Word16, Weight)] -> Vector (Word16, Weight)
forall a. [a] -> Vector a
Vector.fromList [(Word16, Weight)]
ps)) [(Text, [(Word16, Weight)])]
sorted)))
  shrink :: Tsvector -> [Tsvector]
shrink (Tsvector Vector (Text, Vector (Word16, Weight))
lexemes) =
    ([(Text, [(Word16, Weight)])] -> Tsvector)
-> [[(Text, [(Word16, Weight)])]] -> [Tsvector]
forall a b. (a -> b) -> [a] -> [b]
map (\[(Text, [(Word16, Weight)])]
ls -> [(Text, Vector (Word16, Weight))] -> Tsvector
normalizeLexemes (((Text, [(Word16, Weight)]) -> (Text, Vector (Word16, Weight)))
-> [(Text, [(Word16, Weight)])]
-> [(Text, Vector (Word16, Weight))]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, [(Word16, Weight)]
ps) -> (Text
t, [(Word16, Weight)] -> Vector (Word16, Weight)
forall a. [a] -> Vector a
Vector.fromList [(Word16, Weight)]
ps)) [(Text, [(Word16, Weight)])]
ls)) ([[(Text, [(Word16, Weight)])]] -> [Tsvector])
-> [[(Text, [(Word16, Weight)])]] -> [Tsvector]
forall a b. (a -> b) -> a -> b
$
      ((Text, [(Word16, Weight)]) -> [(Text, [(Word16, Weight)])])
-> [(Text, [(Word16, Weight)])] -> [[(Text, [(Word16, Weight)])]]
forall a. (a -> [a]) -> [a] -> [[a]]
QuickCheck.shrinkList
        ( \(Text
tok, [(Word16, Weight)]
positions) -> do
            Text
shrunkenTok <- (String -> Text) -> (Text -> String) -> Text -> [Text]
forall a b. Arbitrary a => (a -> b) -> (b -> a) -> b -> [b]
QuickCheck.shrinkMap String -> Text
Text.pack Text -> String
Text.unpack Text
tok
            [(Word16, Weight)]
shrunkenPositions <- ((Word16, Weight) -> [(Word16, Weight)])
-> [(Word16, Weight)] -> [[(Word16, Weight)]]
forall a. (a -> [a]) -> [a] -> [[a]]
QuickCheck.shrinkList (Word16, Weight) -> [(Word16, Weight)]
forall a. Arbitrary a => a -> [a]
shrink [(Word16, Weight)]
positions
            pure (Text
shrunkenTok, [(Word16, Weight)]
shrunkenPositions)
        )
        (((Text, Vector (Word16, Weight)) -> (Text, [(Word16, Weight)]))
-> [(Text, Vector (Word16, Weight))]
-> [(Text, [(Word16, Weight)])]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, Vector (Word16, Weight)
ps) -> (Text
t, Vector (Word16, Weight) -> [(Word16, Weight)]
forall a. Vector a -> [a]
Vector.toList Vector (Word16, Weight)
ps)) (Vector (Text, Vector (Word16, Weight))
-> [(Text, Vector (Word16, Weight))]
forall a. Vector a -> [a]
Vector.toList Vector (Text, Vector (Word16, Weight))
lexemes))

-- | Sort lexemes alphabetically and deduplicate by lexeme text, merging positions.
-- Positions within each lexeme are sorted by position number and deduplicated
-- (keeping the highest weight for duplicate positions), matching PostgreSQL's canonical form.
normalizeLexemes :: [(Text, Vector (Word16, Weight))] -> Tsvector
normalizeLexemes :: [(Text, Vector (Word16, Weight))] -> Tsvector
normalizeLexemes [(Text, Vector (Word16, Weight))]
lexemes =
  let m :: Map Text [(Word16, Weight)]
m = ([(Word16, Weight)] -> [(Word16, Weight)] -> [(Word16, Weight)])
-> [(Text, [(Word16, Weight)])] -> Map Text [(Word16, Weight)]
forall k a. Ord k => (a -> a -> a) -> [(k, a)] -> Map k a
Map.fromListWith [(Word16, Weight)] -> [(Word16, Weight)] -> [(Word16, Weight)]
forall a. Semigroup a => a -> a -> a
(<>) (((Text, Vector (Word16, Weight)) -> (Text, [(Word16, Weight)]))
-> [(Text, Vector (Word16, Weight))]
-> [(Text, [(Word16, Weight)])]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, Vector (Word16, Weight)
ps) -> (Text
t, Vector (Word16, Weight) -> [(Word16, Weight)]
forall a. Vector a -> [a]
Vector.toList Vector (Word16, Weight)
ps)) [(Text, Vector (Word16, Weight))]
lexemes)
      sorted :: [(Text, [(Word16, Weight)])]
sorted = Map Text [(Word16, Weight)] -> [(Text, [(Word16, Weight)])]
forall k a. Map k a -> [(k, a)]
Map.toAscList Map Text [(Word16, Weight)]
m
   in Vector (Text, Vector (Word16, Weight)) -> Tsvector
Tsvector ([(Text, Vector (Word16, Weight))]
-> Vector (Text, Vector (Word16, Weight))
forall a. [a] -> Vector a
Vector.fromList (((Text, [(Word16, Weight)]) -> (Text, Vector (Word16, Weight)))
-> [(Text, [(Word16, Weight)])]
-> [(Text, Vector (Word16, Weight))]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, [(Word16, Weight)]
ps) -> (Text
t, [(Word16, Weight)] -> Vector (Word16, Weight)
forall a. [a] -> Vector a
Vector.fromList ([(Word16, Weight)] -> [(Word16, Weight)]
sortAndDedupPositions [(Word16, Weight)]
ps))) [(Text, [(Word16, Weight)])]
sorted))

-- | Sort positions by position number ascending, deduplicating by position
-- (keeping the minimum weight, i.e. highest priority: A < B < C < D).
sortAndDedupPositions :: [(Word16, Weight)] -> [(Word16, Weight)]
sortAndDedupPositions :: [(Word16, Weight)] -> [(Word16, Weight)]
sortAndDedupPositions =
  ([(Word16, Weight)] -> (Word16, Weight))
-> [[(Word16, Weight)]] -> [(Word16, Weight)]
forall a b. (a -> b) -> [a] -> [b]
map (((Word16, Weight) -> (Word16, Weight) -> (Word16, Weight))
-> [(Word16, Weight)] -> (Word16, Weight)
forall a. (a -> a -> a) -> [a] -> a
forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldr1 (\(Word16
p, Weight
w1) (Word16
_, Weight
w2) -> (Word16
p, Weight -> Weight -> Weight
forall a. Ord a => a -> a -> a
min Weight
w1 Weight
w2)))
    ([[(Word16, Weight)]] -> [(Word16, Weight)])
-> ([(Word16, Weight)] -> [[(Word16, Weight)]])
-> [(Word16, Weight)]
-> [(Word16, Weight)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
forall {k} (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. ((Word16, Weight) -> (Word16, Weight) -> Bool)
-> [(Word16, Weight)] -> [[(Word16, Weight)]]
forall a. (a -> a -> Bool) -> [a] -> [[a]]
List.groupBy (\(Word16, Weight)
a (Word16, Weight)
b -> (Word16, Weight) -> Word16
forall a b. (a, b) -> a
fst (Word16, Weight)
a Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
== (Word16, Weight) -> Word16
forall a b. (a, b) -> a
fst (Word16, Weight)
b)
    ([(Word16, Weight)] -> [[(Word16, Weight)]])
-> ([(Word16, Weight)] -> [(Word16, Weight)])
-> [(Word16, Weight)]
-> [[(Word16, Weight)]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
forall {k} (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. ((Word16, Weight) -> Word16)
-> [(Word16, Weight)] -> [(Word16, Weight)]
forall b a. Ord b => (a -> b) -> [a] -> [a]
List.sortOn (Word16, Weight) -> Word16
forall a b. (a, b) -> a
fst

instance IsScalar Tsvector where
  schemaName :: Tagged Tsvector (Maybe Text)
schemaName = Maybe Text -> Tagged Tsvector (Maybe Text)
forall {k} (s :: k) b. b -> Tagged s b
Tagged Maybe Text
forall a. Maybe a
Nothing
  typeName :: Tagged Tsvector Text
typeName = Text -> Tagged Tsvector Text
forall {k} (s :: k) b. b -> Tagged s b
Tagged Text
"tsvector"
  baseOid :: Tagged Tsvector (Maybe Word32)
baseOid = Maybe Word32 -> Tagged Tsvector (Maybe Word32)
forall {k} (s :: k) b. b -> Tagged s b
Tagged (Word32 -> Maybe Word32
forall a. a -> Maybe a
Just Word32
3614)
  arrayOid :: Tagged Tsvector (Maybe Word32)
arrayOid = Maybe Word32 -> Tagged Tsvector (Maybe Word32)
forall {k} (s :: k) b. b -> Tagged s b
Tagged (Word32 -> Maybe Word32
forall a. a -> Maybe a
Just Word32
3643)
  typeParams :: Tagged Tsvector [Text]
typeParams = [Text] -> Tagged Tsvector [Text]
forall {k} (s :: k) b. b -> Tagged s b
Tagged []

  -- Binary format:
  -- 4 bytes: number of lexemes (int32)
  -- Per lexeme:
  --   N bytes: lexeme text as null-terminated UTF-8 string
  --   2 bytes: number of positions (uint16)
  --   Per position:
  --     2 bytes: uint16 where bits 14-15 = weight (A=3,B=2,C=1,D=0), bits 0-13 = position
  binaryEncoder :: Tsvector -> Write
binaryEncoder (Tsvector Vector (Text, Vector (Word16, Weight))
lexemes) =
    Int32 -> Write
Write.bInt32 (Int -> Int32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Vector (Text, Vector (Word16, Weight)) -> Int
forall a. Vector a -> Int
Vector.length Vector (Text, Vector (Word16, Weight))
lexemes))
      Write -> Write -> Write
forall a. Semigroup a => a -> a -> a
<> ((Text, Vector (Word16, Weight)) -> Write)
-> Vector (Text, Vector (Word16, Weight)) -> Write
forall m a. Monoid m => (a -> m) -> Vector a -> m
Vector.foldMap (Text, Vector (Word16, Weight)) -> Write
encodeLexeme Vector (Text, Vector (Word16, Weight))
lexemes
    where
      encodeLexeme :: (Text, Vector (Word16, Weight)) -> Write
encodeLexeme (Text
token, Vector (Word16, Weight)
positions) =
        let tokenBytes :: ByteString
tokenBytes = Text -> ByteString
Text.Encoding.encodeUtf8 Text
token
         in ByteString -> Write
Write.byteString ByteString
tokenBytes
              Write -> Write -> Write
forall a. Semigroup a => a -> a -> a
<> Word8 -> Write
Write.word8 Word8
0 -- null terminator
              Write -> Write -> Write
forall a. Semigroup a => a -> a -> a
<> Word16 -> Write
Write.bWord16 (Int -> Word16
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Vector (Word16, Weight) -> Int
forall a. Vector a -> Int
Vector.length Vector (Word16, Weight)
positions))
              Write -> Write -> Write
forall a. Semigroup a => a -> a -> a
<> ((Word16, Weight) -> Write) -> Vector (Word16, Weight) -> Write
forall m a. Monoid m => (a -> m) -> Vector a -> m
Vector.foldMap (Word16, Weight) -> Write
encodePosition Vector (Word16, Weight)
positions
      encodePosition :: (Word16, Weight) -> Write
encodePosition (Word16
pos, Weight
weight) =
        let weightBits :: Word16
weightBits = case Weight
weight of
              Weight
AWeight -> Word16
3
              Weight
BWeight -> Word16
2
              Weight
CWeight -> Word16
1
              Weight
DWeight -> Word16
0
            -- PostgreSQL tsvector positions must be in the range 1..16383.
            -- Clamp here to avoid silent truncation by bit masking.
            posClamped :: Word16
posClamped = Word16 -> Word16 -> Word16
forall a. Ord a => a -> a -> a
max Word16
1 (Word16 -> Word16 -> Word16
forall a. Ord a => a -> a -> a
min Word16
16383 Word16
pos)
         in Word16 -> Write
Write.bWord16 ((Word16
weightBits Word16 -> Int -> Word16
forall a. Bits a => a -> Int -> a
`shiftL` Int
14) Word16 -> Word16 -> Word16
forall a. Bits a => a -> a -> a
.|. Word16
posClamped)

  binaryDecoder :: Variable (Either DecodingError Tsvector)
binaryDecoder = ExceptT DecodingError Variable Tsvector
-> Variable (Either DecodingError Tsvector)
forall e (m :: * -> *) a. ExceptT e m a -> m (Either e a)
runExceptT do
    Int32
numLexemes <- Variable Int32 -> ExceptT DecodingError Variable Int32
forall (m :: * -> *) a. Monad m => m a -> ExceptT DecodingError m a
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (Variable Int32 -> ExceptT DecodingError Variable Int32)
-> Variable Int32 -> ExceptT DecodingError Variable Int32
forall a b. (a -> b) -> a -> b
$ Fixed Int32 -> Variable Int32
forall a. Fixed a -> Variable a
PtrPeeker.fixed Fixed Int32
PtrPeeker.beSignedInt4
    Bool
-> ExceptT DecodingError Variable ()
-> ExceptT DecodingError Variable ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int32
numLexemes Int32 -> Int32 -> Bool
forall a. Ord a => a -> a -> Bool
< Int32
0) do
      DecodingError -> ExceptT DecodingError Variable ()
forall a. DecodingError -> ExceptT DecodingError Variable a
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError
        ( DecodingError
            { location :: [Text]
location = [Text
"tsvector", Text
"lexemeCount"],
              reason :: DecodingErrorReason
reason =
                Text -> ByteString -> DecodingErrorReason
ParsingDecodingErrorReason
                  (String -> Text
forall a. IsString a => String -> a
fromString String
"Negative lexeme count in tsvector binary data")
                  ByteString
ByteString.empty
            }
        )
    Vector (Text, Vector (Word16, Weight))
lexemes <- [(Text, Vector (Word16, Weight))]
-> Vector (Text, Vector (Word16, Weight))
forall a. [a] -> Vector a
Vector.fromList ([(Text, Vector (Word16, Weight))]
 -> Vector (Text, Vector (Word16, Weight)))
-> ExceptT DecodingError Variable [(Text, Vector (Word16, Weight))]
-> ExceptT
     DecodingError Variable (Vector (Text, Vector (Word16, Weight)))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int
-> ExceptT DecodingError Variable (Text, Vector (Word16, Weight))
-> ExceptT DecodingError Variable [(Text, Vector (Word16, Weight))]
forall (m :: * -> *) a. Applicative m => Int -> m a -> m [a]
replicateM (Int32 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int32
numLexemes) ExceptT DecodingError Variable (Text, Vector (Word16, Weight))
forall {t :: (* -> *) -> * -> *}.
(MonadTrans t, MonadError DecodingError (t Variable)) =>
t Variable (Text, Vector (Word16, Weight))
decodeLexeme
    pure (Vector (Text, Vector (Word16, Weight)) -> Tsvector
Tsvector Vector (Text, Vector (Word16, Weight))
lexemes)
    where
      decodeLexeme :: t Variable (Text, Vector (Word16, Weight))
decodeLexeme = do
        -- Read null-terminated UTF-8 string
        ByteString
tokenBytes <- Variable ByteString -> t Variable ByteString
forall (m :: * -> *) a. Monad m => m a -> t m a
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift Variable ByteString
PtrPeeker.nullTerminatedStringAsByteString
        case ByteString -> Either UnicodeException Text
Text.Encoding.decodeUtf8' ByteString
tokenBytes of
          Left UnicodeException
e ->
            DecodingError -> t Variable (Text, Vector (Word16, Weight))
forall a. DecodingError -> t Variable a
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError
              ( DecodingError
                  { location :: [Text]
location = [Text
"tsvector", Text
"lexeme"],
                    reason :: DecodingErrorReason
reason = Text -> ByteString -> DecodingErrorReason
ParsingDecodingErrorReason (String -> Text
forall a. IsString a => String -> a
fromString (UnicodeException -> String
forall a. Show a => a -> String
show UnicodeException
e)) ByteString
tokenBytes
                  }
              )
          Right Text
token
            | Text -> Bool
Text.null Text
token ->
                DecodingError -> t Variable (Text, Vector (Word16, Weight))
forall a. DecodingError -> t Variable a
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError
                  ( DecodingError
                      { location :: [Text]
location = [Text
"tsvector", Text
"lexeme"],
                        reason :: DecodingErrorReason
reason =
                          Text -> ByteString -> DecodingErrorReason
ParsingDecodingErrorReason
                            (String -> Text
forall a. IsString a => String -> a
fromString String
"empty lexeme is not allowed in tsvector")
                            ByteString
tokenBytes
                      }
                  )
            | Bool
otherwise -> do
                Word16
numPositions <- Variable Word16 -> t Variable Word16
forall (m :: * -> *) a. Monad m => m a -> t m a
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (Variable Word16 -> t Variable Word16)
-> Variable Word16 -> t Variable Word16
forall a b. (a -> b) -> a -> b
$ Fixed Word16 -> Variable Word16
forall a. Fixed a -> Variable a
PtrPeeker.fixed Fixed Word16
PtrPeeker.beUnsignedInt2
                Vector (Word16, Weight)
positions <-
                  [(Word16, Weight)] -> Vector (Word16, Weight)
forall a. [a] -> Vector a
Vector.fromList ([(Word16, Weight)] -> Vector (Word16, Weight))
-> t Variable [(Word16, Weight)]
-> t Variable (Vector (Word16, Weight))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int -> t Variable (Word16, Weight) -> t Variable [(Word16, Weight)]
forall (m :: * -> *) a. Applicative m => Int -> m a -> m [a]
replicateM (Word16 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word16
numPositions) do
                    Word16
posWord <- Variable Word16 -> t Variable Word16
forall (m :: * -> *) a. Monad m => m a -> t m a
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (Variable Word16 -> t Variable Word16)
-> Variable Word16 -> t Variable Word16
forall a b. (a -> b) -> a -> b
$ Fixed Word16 -> Variable Word16
forall a. Fixed a -> Variable a
PtrPeeker.fixed Fixed Word16
PtrPeeker.beUnsignedInt2
                    let weightBits :: Word16
weightBits = (Word16
posWord Word16 -> Int -> Word16
forall a. Bits a => a -> Int -> a
`shiftR` Int
14) Word16 -> Word16 -> Word16
forall a. Bits a => a -> a -> a
.&. Word16
0x3
                    let weight :: Weight
weight = case Word16
weightBits of
                          Word16
3 -> Weight
AWeight
                          Word16
2 -> Weight
BWeight
                          Word16
1 -> Weight
CWeight
                          Word16
_ -> Weight
DWeight
                    let pos :: Word16
pos = Word16
posWord Word16 -> Word16 -> Word16
forall a. Bits a => a -> a -> a
.&. Word16
0x3FFF
                    (Word16, Weight) -> t Variable (Word16, Weight)
forall a. a -> t Variable a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Word16
pos, Weight
weight)
                pure (Text
token, Vector (Word16, Weight)
positions)

  -- Text format: 'lexeme1':1A,2B 'lexeme2':3C
  -- Single quotes are escaped as '', backslashes as \\
  textualEncoder :: Tsvector -> TextBuilder
textualEncoder (Tsvector Vector (Text, Vector (Word16, Weight))
lexemes) =
    TextBuilder
-> ((Text, Vector (Word16, Weight)) -> TextBuilder)
-> [(Text, Vector (Word16, Weight))]
-> TextBuilder
forall (f :: * -> *) a.
Foldable f =>
TextBuilder -> (a -> TextBuilder) -> f a -> TextBuilder
TextBuilder.intercalateMap TextBuilder
" " (Text, Vector (Word16, Weight)) -> TextBuilder
forall {a}. Show a => (Text, Vector (a, Weight)) -> TextBuilder
encodeLexeme (Vector (Text, Vector (Word16, Weight))
-> [(Text, Vector (Word16, Weight))]
forall a. Vector a -> [a]
Vector.toList Vector (Text, Vector (Word16, Weight))
lexemes)
    where
      encodeLexeme :: (Text, Vector (a, Weight)) -> TextBuilder
encodeLexeme (Text
token, Vector (a, Weight)
positions) =
        Char -> TextBuilder
TextBuilder.char Char
'\''
          TextBuilder -> TextBuilder -> TextBuilder
forall a. Semigroup a => a -> a -> a
<> Text -> TextBuilder
TextBuilder.text (Text -> Text
escapeToken Text
token)
          TextBuilder -> TextBuilder -> TextBuilder
forall a. Semigroup a => a -> a -> a
<> Char -> TextBuilder
TextBuilder.char Char
'\''
          TextBuilder -> TextBuilder -> TextBuilder
forall a. Semigroup a => a -> a -> a
<> if Vector (a, Weight) -> Bool
forall a. Vector a -> Bool
Vector.null Vector (a, Weight)
positions
            then TextBuilder
forall a. Monoid a => a
mempty
            else Char -> TextBuilder
TextBuilder.char Char
':' TextBuilder -> TextBuilder -> TextBuilder
forall a. Semigroup a => a -> a -> a
<> TextBuilder
-> ((a, Weight) -> TextBuilder) -> [(a, Weight)] -> TextBuilder
forall (f :: * -> *) a.
Foldable f =>
TextBuilder -> (a -> TextBuilder) -> f a -> TextBuilder
TextBuilder.intercalateMap TextBuilder
"," (a, Weight) -> TextBuilder
forall {a}. Show a => (a, Weight) -> TextBuilder
encodePosition (Vector (a, Weight) -> [(a, Weight)]
forall a. Vector a -> [a]
Vector.toList Vector (a, Weight)
positions)
      encodePosition :: (a, Weight) -> TextBuilder
encodePosition (a
pos, Weight
weight) =
        String -> TextBuilder
TextBuilder.string (a -> String
forall a. Show a => a -> String
show a
pos)
          TextBuilder -> TextBuilder -> TextBuilder
forall a. Semigroup a => a -> a -> a
<> case Weight
weight of
            Weight
AWeight -> Char -> TextBuilder
TextBuilder.char Char
'A'
            Weight
BWeight -> Char -> TextBuilder
TextBuilder.char Char
'B'
            Weight
CWeight -> Char -> TextBuilder
TextBuilder.char Char
'C'
            Weight
DWeight -> TextBuilder
forall a. Monoid a => a
mempty -- D is default, omitted by PostgreSQL
      escapeToken :: Text -> Text
escapeToken = (Char -> Text) -> Text -> Text
Text.concatMap Char -> Text
escapeChar
      escapeChar :: Char -> Text
escapeChar Char
c = case Char
c of
        Char
'\'' -> Text
"''"
        Char
'\\' -> Text
"\\\\"
        Char
_ -> Char -> Text
Text.singleton Char
c

  textualDecoder :: Parser Tsvector
textualDecoder = do
    -- Allow and ignore leading whitespace before the first lexeme
    Parser Text ()
Attoparsec.skipSpace
    [(Text, [(Word16, Weight)])]
lexemes <- Parser Text (Text, [(Word16, Weight)])
lexemeParser Parser Text (Text, [(Word16, Weight)])
-> Parser Text () -> Parser Text [(Text, [(Word16, Weight)])]
forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`Attoparsec.sepBy` Parser Text ()
space1
    -- Allow and ignore trailing whitespace after the last lexeme
    Parser Text ()
Attoparsec.skipSpace
    -- Sort and deduplicate to match PostgreSQL's canonical form
    let Tsvector Vector (Text, Vector (Word16, Weight))
normalized = [(Text, Vector (Word16, Weight))] -> Tsvector
normalizeLexemes (((Text, [(Word16, Weight)]) -> (Text, Vector (Word16, Weight)))
-> [(Text, [(Word16, Weight)])]
-> [(Text, Vector (Word16, Weight))]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, [(Word16, Weight)]
ps) -> (Text
t, [(Word16, Weight)] -> Vector (Word16, Weight)
forall a. [a] -> Vector a
Vector.fromList [(Word16, Weight)]
ps)) [(Text, [(Word16, Weight)])]
lexemes)
    pure (Vector (Text, Vector (Word16, Weight)) -> Tsvector
Tsvector Vector (Text, Vector (Word16, Weight))
normalized)
    where
      -- Consume one or more space / tab / newline characters between lexemes
      space1 :: Parser Text ()
space1 = do
        Text
_ <- (Char -> Bool) -> Parser Text Text
Attoparsec.takeWhile1 (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
' ' Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\t' Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\n')
        pure ()
      lexemeParser :: Parser Text (Text, [(Word16, Weight)])
lexemeParser = do
        Char
_ <- Char -> Parser Text Char
Attoparsec.char Char
'\''
        Text
token <- Parser Text Text
parseToken
        Char
_ <- Char -> Parser Text Char
Attoparsec.char Char
'\''
        [(Word16, Weight)]
positions <- Parser Text [(Word16, Weight)]
parsePositions Parser Text [(Word16, Weight)]
-> Parser Text [(Word16, Weight)] -> Parser Text [(Word16, Weight)]
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> [(Word16, Weight)] -> Parser Text [(Word16, Weight)]
forall a. a -> Parser Text a
forall (f :: * -> *) a. Applicative f => a -> f a
pure []
        pure (Text
token, [(Word16, Weight)]
positions)
      parseToken :: Parser Text Text
parseToken = do
        String
chars <- Parser Text Char -> Parser Text String
forall a. Parser Text a -> Parser Text [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many (Parser Text Char
escapedQuote Parser Text Char -> Parser Text Char -> Parser Text Char
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text Char
escapedBackslash Parser Text Char -> Parser Text Char -> Parser Text Char
forall a. Parser Text a -> Parser Text a -> Parser Text a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text Char
normalChar)
        pure (String -> Text
Text.pack String
chars)
      escapedQuote :: Parser Text Char
escapedQuote = do
        Text
_ <- Text -> Parser Text Text
Attoparsec.string Text
"''"
        pure Char
'\''
      escapedBackslash :: Parser Text Char
escapedBackslash = do
        Text
_ <- Text -> Parser Text Text
Attoparsec.string Text
"\\\\"
        pure Char
'\\'
      normalChar :: Parser Text Char
normalChar = (Char -> Bool) -> Parser Text Char
Attoparsec.satisfy (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\'' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\\')
      parsePositions :: Parser Text [(Word16, Weight)]
parsePositions = do
        Char
_ <- Char -> Parser Text Char
Attoparsec.char Char
':'
        Parser Text (Word16, Weight)
parsePosition Parser Text (Word16, Weight)
-> Parser Text Char -> Parser Text [(Word16, Weight)]
forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`Attoparsec.sepBy1` Char -> Parser Text Char
Attoparsec.char Char
','
      parsePosition :: Parser Text (Word16, Weight)
parsePosition = do
        Integer
pos <- forall a. Integral a => Parser a
Attoparsec.decimal @Integer
        Bool -> Parser Text () -> Parser Text ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Integer
pos Integer -> Integer -> Bool
forall a. Ord a => a -> a -> Bool
< Integer
1 Bool -> Bool -> Bool
|| Integer
pos Integer -> Integer -> Bool
forall a. Ord a => a -> a -> Bool
> Integer
16383) do
          String -> Parser Text ()
forall a. HasCallStack => String -> Parser Text a
forall (m :: * -> *) a.
(MonadFail m, HasCallStack) =>
String -> m a
fail (String
"tsvector position out of range 1..16383: " String -> ShowS
forall a. Semigroup a => a -> a -> a
<> Integer -> String
forall a. Show a => a -> String
show Integer
pos)
        let pos' :: Word16
pos' = Integer -> Word16
forall a b. (Integral a, Num b) => a -> b
fromIntegral Integer
pos :: Word16
        Weight
weight <-
          [Parser Text Weight] -> Parser Text Weight
forall (t :: * -> *) (f :: * -> *) a.
(Foldable t, Alternative f) =>
t (f a) -> f a
asum
            [ Char -> Parser Text Char
Attoparsec.char Char
'A' Parser Text Char -> Weight -> Parser Text Weight
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Weight
AWeight,
              Char -> Parser Text Char
Attoparsec.char Char
'B' Parser Text Char -> Weight -> Parser Text Weight
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Weight
BWeight,
              Char -> Parser Text Char
Attoparsec.char Char
'C' Parser Text Char -> Weight -> Parser Text Weight
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Weight
CWeight,
              Char -> Parser Text Char
Attoparsec.char Char
'D' Parser Text Char -> Weight -> Parser Text Weight
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Weight
DWeight,
              Weight -> Parser Text Weight
forall a. a -> Parser Text a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Weight
DWeight
            ]
        pure (Word16
pos', Weight
weight)

-- * Accessors

-- | Extract the tsvector as a list of (lexeme, positions) pairs.
-- Lexemes are in sorted order. Each position is a (position, weight) pair where position is 1-16383.
toLexemeList :: Tsvector -> [(Text, [(Word16, Weight)])]
toLexemeList :: Tsvector -> [(Text, [(Word16, Weight)])]
toLexemeList (Tsvector Vector (Text, Vector (Word16, Weight))
lexemes) =
  ((Text, Vector (Word16, Weight)) -> (Text, [(Word16, Weight)]))
-> [(Text, Vector (Word16, Weight))]
-> [(Text, [(Word16, Weight)])]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, Vector (Word16, Weight)
ps) -> (Text
t, Vector (Word16, Weight) -> [(Word16, Weight)]
forall a. Vector a -> [a]
Vector.toList Vector (Word16, Weight)
ps)) (Vector (Text, Vector (Word16, Weight))
-> [(Text, Vector (Word16, Weight))]
forall a. Vector a -> [a]
Vector.toList Vector (Text, Vector (Word16, Weight))
lexemes)

-- * Constructors

-- | Construct a tsvector from a list of (lexeme, positions) pairs with validation.
-- Returns 'Nothing' if any lexeme is empty, contains null characters,
-- or has positions outside the valid range 1..16383.
-- Sorts and deduplicates lexemes to match PostgreSQL's canonical representation.
refineFromLexemeList :: [(Text, [(Word16, Weight)])] -> Maybe Tsvector
refineFromLexemeList :: [(Text, [(Word16, Weight)])] -> Maybe Tsvector
refineFromLexemeList [(Text, [(Word16, Weight)])]
lexemes =
  if ((Text, [(Word16, Weight)]) -> Bool)
-> [(Text, [(Word16, Weight)])] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (\(Text
t, [(Word16, Weight)]
ps) -> Text -> Bool
Text.null Text
t Bool -> Bool -> Bool
|| Char -> Text -> Bool
Text.elem Char
'\NUL' Text
t Bool -> Bool -> Bool
|| ((Word16, Weight) -> Bool) -> [(Word16, Weight)] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (\(Word16
p, Weight
_) -> Word16
p Word16 -> Word16 -> Bool
forall a. Ord a => a -> a -> Bool
< Word16
1 Bool -> Bool -> Bool
|| Word16
p Word16 -> Word16 -> Bool
forall a. Ord a => a -> a -> Bool
> Word16
16383) [(Word16, Weight)]
ps) [(Text, [(Word16, Weight)])]
lexemes
    then Maybe Tsvector
forall a. Maybe a
Nothing
    else Tsvector -> Maybe Tsvector
forall a. a -> Maybe a
Just ([(Text, Vector (Word16, Weight))] -> Tsvector
normalizeLexemes (((Text, [(Word16, Weight)]) -> (Text, Vector (Word16, Weight)))
-> [(Text, [(Word16, Weight)])]
-> [(Text, Vector (Word16, Weight))]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, [(Word16, Weight)]
ps) -> (Text
t, [(Word16, Weight)] -> Vector (Word16, Weight)
forall a. [a] -> Vector a
Vector.fromList [(Word16, Weight)]
ps)) [(Text, [(Word16, Weight)])]
lexemes))

-- | Construct a tsvector from a list of (lexeme, positions) pairs.
-- Strips null characters from lexemes and removes empty lexemes.
-- Sorts and deduplicates lexemes to match PostgreSQL's canonical representation.
normalizeFromLexemeList :: [(Text, [(Word16, Weight)])] -> Tsvector
normalizeFromLexemeList :: [(Text, [(Word16, Weight)])] -> Tsvector
normalizeFromLexemeList [(Text, [(Word16, Weight)])]
lexemes =
  let cleaned :: [(Text, [(Word16, Weight)])]
cleaned = ((Text, [(Word16, Weight)]) -> Bool)
-> [(Text, [(Word16, Weight)])] -> [(Text, [(Word16, Weight)])]
forall a. (a -> Bool) -> [a] -> [a]
filter (\(Text
t, [(Word16, Weight)]
_) -> Bool -> Bool
not (Text -> Bool
Text.null Text
t)) ([(Text, [(Word16, Weight)])] -> [(Text, [(Word16, Weight)])])
-> [(Text, [(Word16, Weight)])] -> [(Text, [(Word16, Weight)])]
forall a b. (a -> b) -> a -> b
$ ((Text, [(Word16, Weight)]) -> (Text, [(Word16, Weight)]))
-> [(Text, [(Word16, Weight)])] -> [(Text, [(Word16, Weight)])]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, [(Word16, Weight)]
ps) -> (HasCallStack => Text -> Text -> Text -> Text
Text -> Text -> Text -> Text
Text.replace Text
"\NUL" Text
"" Text
t, [(Word16, Weight)]
ps)) [(Text, [(Word16, Weight)])]
lexemes
   in [(Text, Vector (Word16, Weight))] -> Tsvector
normalizeLexemes (((Text, [(Word16, Weight)]) -> (Text, Vector (Word16, Weight)))
-> [(Text, [(Word16, Weight)])]
-> [(Text, Vector (Word16, Weight))]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
t, [(Word16, Weight)]
ps) -> (Text
t, [(Word16, Weight)] -> Vector (Word16, Weight)
forall a. [a] -> Vector a
Vector.fromList [(Word16, Weight)]
ps)) [(Text, [(Word16, Weight)])]
cleaned)