import module namespace kwic="http://exist-db.org/xquery/kwic" at "xmldb:exist:///db/modules/kwic.xql";
(:~
 : Callback handed to kwic:summarize for every text node that contributes
 : to a hit's surrounding context.
 :
 : @param $node  the text node being added to the context
 : @param $mode  'before' when the node belongs to the left-hand context;
 :               any other value is treated as right-hand context
 : @return the empty sequence for text directly inside SPEAKER or STAGEDIR
 :         (such text is left out of the context); otherwise the node's text
 :         with one separating blank appended (mode 'before') or prepended
 :)
declare function local:filter($node as node(), $mode as xs:string) as xs:string? {
    if (empty($node/parent::*[self::SPEAKER or self::STAGEDIR])) then
        (: ordinary text: pad with a blank on the side that faces the hit :)
        if ($mode eq 'before') then
            concat($node, ' ')
        else
            concat(' ', $node)
    else
        (: speaker labels and stage directions are skipped :)
        ()
};
(: Main query: builds one collocation table per distinct hit term.
   Steps:
   1. run a full-text query over the SPEECH elements of Hamlet
   2. summarize every hit as a KWIC entry (text before / hit / text after)
   3. per distinct hit term, collect the distinct words found at each
      context position (-$scope .. -1 and 1 .. $scope)
   4. emit a table with one column per context position and the sorted
      words of each position spread over the rows
:)
(: context scope: number of preceding / following words :)
let $scope := 5
(: determine context width for KWIC results: 10 characters per context word, minimally 40 :)
let $cutoff := max(($scope * 10, xs:int(40)))
(: NOTE(review): assumes kwic:summarize reads the context width from
   config/@width; confirm against the kwic module installed at
   /db/modules/kwic.xql :)
let $config := <config xmlns="" width="{$cutoff}"/>
let $hits := doc("/db/shakespeare/plays/hamlet.xml")//SPEECH[ft:query(., "nature king sir")]
(: local:filter is passed as a two-argument callback :)
let $KWIC := $hits/kwic:summarize(., $config,
    util:function(xs:QName("local:filter"), 2))
(: split up collocations per search term :)
for $term in distinct-values($KWIC//kwic:hit/lower-case(normalize-space(.)))
order by $term
return
    (: KWIC entries whose hit equals the current term; selected once and
       reused for both context sides :)
    let $matching := $KWIC[kwic:hit/lower-case(normalize-space(.)) eq $term]
    (: prepare entire left / right contexts for tokenization:
       - lower case
       - normalize whitespace
       - reverse the left context, so that token 1 is the word nearest the hit
    :)
    let $prevText :=
        for $a in $matching/kwic:prev/lower-case(normalize-space(.))
        return string-join(reverse(tokenize($a, '\W+')), ' ')
    let $nextText := $matching//kwic:next/lower-case(normalize-space(.))
    (: per context position, retrieve all distinct words :)
    (: note: discard numbers - only tokens containing a letter are kept :)
    (: NOTE(review): the wrapper element name "context" is a placeholder; the
       code below only relies on its @pos attribute and its w children :)
    let $prev :=
        for $context in ((0 - $scope) to -1)
        let $words :=
            let $tok :=
                for $a in $prevText
                return tokenize($a, '\W+')[matches(., '[a-zA-Z]')][position() = abs($context)]
            for $b in distinct-values($tok)
            order by $b
            return <w>{$b}</w>
        return <context pos="{$context}">{$words}</context>
    let $next :=
        for $context in (1 to $scope)
        let $words :=
            let $tok :=
                for $a in $nextText
                return tokenize($a, '\W+')[matches(., '[a-zA-Z]')][$context]
            for $b in distinct-values($tok)
            order by $b
            return <w>{$b}</w>
        return <context pos="{$context}">{$words}</context>
    (: number of rows needed: the largest word count of any context position :)
    let $max := max(($next|$prev)/count(w))
    (: spread out words-per-context over table rows :)
    (: NOTE(review): table / tr / th / td markup is presumed HTML output;
       adjust if the consumer expects different element names :)
    return
        <table>{
            (: header row: empty corner cell, left positions, "term", right positions :)
            <tr>
                <th/>
                {
                    for $a in $prev
                    return <th>{$a/@pos/string()}</th>
                    ,
                    <th>term</th>
                    ,
                    for $a in $next
                    return <th>{$a/@pos/string()}</th>
                }
            </tr>
            ,
            (: body rows: row $i shows the $i-th word of every context position;
               positions with fewer than $i words yield an empty cell :)
            for $i in (1 to $max)
            return
                <tr>{
                    <td>{$i}</td>,
                    for $a in $prev
                    return
                        <td>{$a/w[$i]/text()}</td>
                    ,
                    <td>{$term}</td>
                    ,
                    for $a in $next
                    return
                        <td>{$a/w[$i]/text()}</td>
                }</tr>
        }</table>