mgr.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397

% dodaj opcję [licencjacka] dla pracy licencjackiej
% dodaj opcję [en] dla wersji angielskiej (mogą być obie: [licencjacka,en])
\documentclass[en]{pracamgr}

\usepackage{definitions}
\usepackage[backend=biber]{biblatex}
\addbibresource{mgr.bib}


% Dane magistranta:
\autor{Marcin Chrzanowski}{370754}

\title{MSO Query Answering on Trees}
\titlepl{Odpowiadanie na zapytania MSO na drzewach}
\kierunek{Computer Science}

% Praca wykonana pod kierunkiem:
% (podać tytuł/stopień imię i nazwisko opiekuna
% Instytut
% ew. Wydział ew. Uczelnia (jeżeli nie MIM UW))
\opiekun{dr hab. Szymon Toruńczyk\\
  Institute of Informatics\\
  }

% miesiąc i~rok:
\date{August 2021}

%Podać dziedzinę wg klasyfikacji Socrates-Erasmus:
\dziedzina{
11.3 Informatyka\\
}

%Klasyfikacja tematyczna według ACM
\klasyfikacja{% TODO
  D. Software\\
  D.127. Blabalgorithms\\
  D.127.6. Numerical blabalysis}

\keywords{MSO, query answering, tree automata, RMQ}

\begin{document}
\maketitle

%tu idzie streszczenie na strone poczatkowa
\begin{abstract}
    We define relabel regular queries on trees, which, via the known equivalence
    between tree automata and MSO formulae on trees, happens to be a
    generalization of the MSO query answering problem on trees. We show these
    queries can be performed in linear time with respect to query size (constant
    time in the case of MSO formulae with only first-order free variables) after
    preprocessing the input tree in linear time. Along the way, we show an
    algorithm for handling queries of the form ``Does the infix of a tree branch
    between nodes $x$ and $y$ belong to the regular language $L$'' (for a
    previously fixed regular language $L$) in the same time complexities. Our
    approach is much simpler in presentation than a previously known solution
    due to \textcite{colcombet}.
\end{abstract}

\tableofcontents

\chapter{Introduction}
% \addcontentsline{toc}{chapter}{Introduction}

\queryproblem[%
    an MSO formula $\varphi(\vec{X})$ over trees with $k$ free
    second-order variables.
]{%
    MSO Query Answering on Trees
}{%
    a tree $T$.
}{%
    given a $k$-tuple of subsets of $T$'s vertices $\vec{W}
    \in \mathcal{P}(V(T))^{k}$, is $\vec{W}$ a satisfying assignment to
    $\vec{X}$? In other words, does $T \models \varphi(\vec{W})$?
}

\queryproblem[%
    a deterministic bottom-up tree automaton $A$ over ranked alphabet $\Sigma$.
]{%
    Relabel Regular Queries on Trees
}{%
    a tree $T$ labeled with $\Sigma$.
}{%
    given $m$ relabelings $v_1 \mapsto a_1, \ldots, v_m \mapsto a_m$, where
    $v_i$ are vertices of $T$ and $a_i$ are elements of $\Sigma$, what state
    does $A$ arrive  at in the root of $T'$, where $T'$ is $T$ with each $v_i$'s
    label modified to the corresponding $a_i$.
}

\chapter{Preliminaries}\label{r:pojecia}

% \begin{quote}
  % Blaba, który jest blaba, nie jest prawdziwym blaba.

% \raggedleft\slshape tłum. z~chińskiego Robert Blarbarucki
% \end{quote}

\section{Definitions}
\subsection{Trees}

We work with finite trees whose vertices are labeled with letters from a finite
alphabet. More formally, given a finite alphabet $\Sigma$, for each $a \in
\Sigma$, $a$ is a tree, and if $t_1, \ldots, t_k$ are trees, then
$a(t_1, \ldots, t_k)$ is also a tree.

We use the standard notions of root, child, sibling, ancestor, descendant, etc.

Binary trees are trees where each node has either no children (the node is a
leaf), or exactly two children (which, based on their order, can be called the
left and the right child).

\subsection{Tree automata}

Consider the case of binary trees labeled with $\Sigma$. A
\definedterm{deterministic, bottom-up tree automaton} (further called just a
\definedterm{tree automaton}) consists of

\begin{itemize}
    \item A finite set of \definedterm{states} $Q$.
    \item A set of \definedterm{accepting states} $F \subseteq Q$.
    \item A bottom-up \definedterm{transition function} $\delta : Q \times
        \Sigma \times Q \to Q$.
    \item An \definedterm{initializatoin function} $\iota : \Sigma \to Q$.
\end{itemize}

A \definedterm{run} of tree automaton $A$ over tree $T$ is a relabeling of $T$
with the elements of $Q$ such that

\begin{itemize}
    \item Each leaf with label $a \in \Sigma$ is relabeled with $\iota(a)$.
    \item If an inner node $v$ has label $a \in \Sigma$, its left child got
        relabeled to $p \in Q$, and its right child got relabeled to $q \in Q$,
        then $v$ gets relabeled with $\delta(p, a, q)$.
\end{itemize}

A run is \definedterm{accepting} if $T$'s root gets relabeled to an accepting
state, that is a state $q \in F$. The set of all trees accepted by an automaton
$A$ is called the \definedterm{language recognized by $A$}, notated $L(A)$. We
call the class of all languages recognized by tree automata \definedterm{regular
tree languages}, analogously to regular languages recognized by finite state
automata.

We note that the expressive power of deterministic bottom-up tree automata is
the same as that of nondeterministic (either bottom-up or top-down) tree
automata.

\subsection{Monadic Second Order (MSO) Logic}
% def of MSO

From a logical point of view, the trees we work with can be seen as structures
over a signature with a single binary relation $E$ and unary relations $U_a$ for
each $a \in \Sigma$. $E(v, w)$ represents a (directed from parent to child) edge
from $v$ to $w$. $U_a(v)$ signifies that $v$'s label is $a$.

For convenience, we will also make use of the binary relation $\leq$, with $v
\leq w$ signifying that $v$ is an ancestor of $w$ (with every vertex being an
ancestor of itself; $<$ can be used to signify a strict ancestor). Note that in
the case of MSO on trees, $\leq$ could be defined using just the edge relation
$E$.

We make use of a fundamental theorem tying MSO logic on trees and tree automata:

\begin{theorem}
    For every MSO formula $\varphi$ over trees, there exists a tree automaton
    $A$ such that for every tree $T$, $T \models \varphi$ if, and only if $T \in
    L(A)$.
\end{theorem}

We note that the converse of this theorem is also true (i.e. that for every tree
automaton, there is a corresponding MSO formula), however we will use only the
MSO to automata direction in this work.

\section{Query answering problems}

Consider a computational problem whose inputs are of the form $(S, q) \in
\mathcal{S} \times Q$ for some set of \definedterm{structures} $\mathcal{S}$ and
some set of \definedterm{queries} $Q$. This induces a \definedterm{query
answering problem} which is divided into two phases:

\begin{description}
    \item[preprocessing] An input structure $S \in \mathcal{S}$ is given and a
        \definedterm{preprocessing algorithm} outputs an indexing structure
        $S'$.
    \item[queries] On-line queries $q_1, \ldots$ from $Q$ about $S$ need to be
        handled, with access to the preprocessing output $S'$.
\end{description}

We are interested in the time complexities of both phases. We use the following
notation for algorithms that have both a preprocessing and query phase: If
it takes $f(n)$ time to complete the preprocessing step for an input of size
$n$, and $g(n, m)$ time to then handle a query of size $m$, we say the algorithm
has time complexity \qptime{$f(n)$}{$g(n, m)$}.

We turn to a discussion of several query problems with known solutions, both to
serve as examples and because we will be using them in our algorithm.

\subsection{Lowest Common Ancestor}

\queryproblem{%
    Lowest Common Ancestor (LCA) Queries
}{%
    a tree $T$.
}{%
    given vertices $x$ and $y$, find the vertex $z$ that's an ancestor of both
    $x$ and $y$, and is their lowest (i.e. furthest from the root) common
    ancestor.
}

\textcite{tarjan1984} were the first to show an optimal \qpoptimal{}
algorithm for LCA. \textcite{schieber1988} used a similar approach but
simplified the indexing structure, keeping the same time complexities.
\textcite{berkman1993} showed a completely new approach to the problem, which
relies on answering range minimum queries (see below) about an array of properly
ordered tree vertices. \textcite{bender2000} offer a simpler presentation of the
algorithm in \cite{berkman1993} and note the equivalence between the LCA and RMQ
problems.

\subsection{Range Minimum Query (RMQ)}

\queryproblem{Range Minimum Queries}{%
    an array $A$ of integers.
}{%
    given indices $i$ and $j$, return the index of the smallest
    element in the subarray $A[i, j]$.
}

As mentioned above, \textcite{bender2000} show an \qpoptimal{}
algorithm for the RMQ problem.

Their method is as follows. First they show that a special case of the RMQ
problem, $\pm1$ RMQ, can be solved in \qpoptimal. This restriction of the problem
is enough to handle LCA queries. Then, for the general RMQ case, a Cartesian
tree\footnote{A Cartesian tree of a list is a binary tree with the list's
minimum element in its root, the root's children being Cartesian trees of the
left and right sublists around the minimal element. It can be constructed in
linear time.} of the list is built, and LCA queries on this tree correspond to
range minimum queries on the list.

\subsection{Word infix regular queries}\label{wordinfix}

\queryproblem[%
    regular language $L$ over alphabet $\Sigma$, given by DFA $A$.
]{Word Infix Regular Queries}{%
    a word $w \in \Sigma^*$.
}{%
    given indices $i$ and $j$ with $1 \leq i < j \leq |w|$, does the infix $w[i,
    j]$ belong to $L$?
}

This problem has a very elegant \qpoptimal{} solution. We present the full
construction as we will be generalizing its internals for the tree case in
Chapter \ref{branchinfix}.

We begin by replacing each letter of $w$ with the set of states of $A$, $Q$.

Each letter $a$ of $w$ defines an injective function on $Q$, and we draw these
functions as directed edges between successive copies of $Q$. For example, if
$A$ in state $q$, reading $a$, would move to state $q'$, then, for any copy of
$Q$ corresponding to an instance of $a$ in the original word, there will be an
edge from $q$ there to $q'$ in the successive copy of $Q$.

Now we will color the vertices of the graph we just constructed with the colors
$1, 2, \ldots, |Q|$ in such a way that

\begin{enumerate}
    \item every copy of $Q$ has one vertex of each of the $|Q|$ colors;
    \item when a vertex of color $i$ has an edge to a vertex of color $j$ in a
        successive copy of $Q$, then $i \geq j$.
\end{enumerate}

The second point basically means that we will be trying to draw single-color
paths, but when paths need to join, it is the higher-colored path that gets cut
off.

The construction is as follows:

\begin{enumerate}
    \item Color an arbitrary vertex of the first copy of $Q$ with the color $1$.
    \item Follow the deterministic edges to the end of the word, coloring all
        vertices along this path with $1$.
    \item Now color another uncolored vertex of the first copy of $Q$ with the
        color $2$.
    \item Try following edges as far as possible, coloring all vertices with
        $2$.
    \item If your run into an already colored vertex, pick an arbitrary
        uncolored vertex in this copy of $Q$ to color with $2$ and continue from
        here.
    \item Repeat steps 3.-5. for each successive color up to $|Q|$.
\end{enumerate}

Additionally, for each vertex $v$, we store the index of the next copy of $Q$ in
which the path of this vertex's color is broken by a lower color. Put this
information in table $BREAK$.

To handle the query ``does $w[i, j] \in L$'', we look at vertex $v$, which is
the vertex of $A$'s
initial state in the $i$th copy of $Q$ and note its color $c$. Now we want to
answer the following question: if we follow the edges of the graph until the
$j$th copy of $Q$, what color will we end in? First, look at $k := BREAK[v]$. If
$k \geq j$, then we know that in the $j$th copy of $Q$, the path we're interested
in still has color $c$. Look at the state in this copy of $Q$ that's colored
with $c$, if it's an accepting state answer YES, if not, answer NO.

If $k < j$, then jump to the $k$th copy of $Q$ and take the edge from the vertex
colored $c$ here to the next copy of $Q$. The vertex $v'$ we end up in will be
colored with color $c' < c$. Continue as we did before, by looking at $k' :=
BREAK[v']$, comparing it to $j$, and either halting if $k' \geq j$, or jumping
again otherwise. Because with each jump we move to a color strictly smaller than
before, the number of jumps is bounded by the number of colors, $|Q|$. Thus the
query is answered in time constant with respect to $|w|$.

\chapter{Branch Infix Regular Queries}\label{branchinfix}

Before solving our main problem, that of MSO queries on trees, we generalize
word infix regular queries (section \ref{wordinfix}) to trees. This will be a
vital step in the MSO queries algorithm, but is an interesting result on its own
so it deserves its own chapter.

\queryproblem[%
    regular language $L$ over alphabet $\Sigma$.
]{Branch Infix Regular Queries}{%
    a $\Sigma$-labeled tree $T$.
}{%
    given a vertex $x$ and its descendant $y$, does the word given by labels on
    the path from $x$ to $y$ belong to $L$?
}

We begin with similar path coloring as in the word case, i.e. we replace each
vertex of the tree with a copy of the states of $A$, $Q$. Each labeled node
defines

\section{Highest Marked Descendant on Path}

We can now reduce the problem to the following:

\queryproblem{Highest Marked Descendant on Path Queries}{%
    a tree $T$ with set $M \subseteq V(T)$ of marked vertices.
}{%
    given a vertex $x$, its descendant $y$, find the node $z \in M$ that
    is the highest marked node on the path between $x$ and $y$, if such $z$
    exists.
}

We will build an index structure, constructible in linear time, that allows us
to handle such queries in constant time. The structure is heavily inspired by
\textcite{bender2000}, where a simple algorithm for computing LCA queries is
presented.

Recall that RMQ queries can be answered in constant time after linear preprocessing of
the input array.

We turn to describing the index structure for our tree problem.

First, we create the array $POST$ of length $n$, which is the post-order of the
nodes.

Next, we label each node of the tree with its pre-order number. We create the
array $PRE$ with the corresponding pre-order labels of the nodes in $POST$, i.e.
if $POST[i] = v$, then $PRE[i]$ is $v$'s pre-order number.

Finally, for each node of the tree, we record its index in $POST$ in the array
$INDEX$.

Observation: given a node $x$ and its descendant $y$, looking at the range
$PRE[INDEX[y], INDEX[x] - 1]$, all the values in this range are descendants of
$x$, and the values smaller than $PRE[INDEX[y]]$ correspond to ancestors of $y$.
In particular, the minimum of that range corresponds to the highest ancestor of
$y$ that's a descendant of $x$.

In our problem, we only care about ancestors of $y$ that are colored black. So
we perform one final modification of our data structure: for all non-black
vertices $v$, we change $PRE[INDEX[v]]$ to $\infty$ (which can be represented by
$n+1$, an integer greater than any node's pre-order label). With $PRE$ modified
like this, we observe that now the minimum value of $PRE[INDEX[y], INDEX[x] -
1]$ corresponds exactly to the answer of our queries -- the highest black node
between $x$ and $y$.

We preprocess $PRE$ for RMQ queries in linear time.

Now when given a query $x$, $y$, we:

\begin{enumerate}
    \item Lookup $i := INDEX[y]$ and $j := INDEX[x] - 1$.
    \item Perform an RMQ query on $PRE[i, j]$, giving us index $k$ of the
        minimal value in that range.
    \item Lookup the corresponding vertex as $z := POST[k]$. This is the
        answer to our query.
\end{enumerate}

\chapter{Relabel Regular Queries on Trees}
h

\chapter{Conclusions}

\printbibliography

\end{document}