mgr.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311

% dodaj opcję [licencjacka] dla pracy licencjackiej
% dodaj opcję [en] dla wersji angielskiej (mogą być obie: [licencjacka,en])
\documentclass[en]{pracamgr}

\usepackage{definitions}
\usepackage[backend=biber]{biblatex}
\usepackage{graphicx}
\usepackage{subcaption}
\addbibresource{mgr.bib}

\usepackage{amsmath}
\usepackage{amsfonts}

\usepackage{hyperref}

% Dane magistranta:
\autor{Marcin Chrzanowski}{370754}

\title{MSO Query Answering on Trees}
\titlepl{Odpowiadanie na zapytania MSO na drzewach}
\kierunek{Computer Science}

% Praca wykonana pod kierunkiem:
% (podać tytuł/stopień imię i nazwisko opiekuna
% Instytut
% ew. Wydział ew. Uczelnia (jeżeli nie MIM UW))
\opiekun{dr hab. Szymon Toruńczyk\\
  Institute of Informatics\\
  }

% miesiąc i~rok:
\date{December 2022}

%Podać dziedzinę wg klasyfikacji Socrates-Erasmus:
\dziedzina{
11.3 Informatyka\\
}

%Klasyfikacja tematyczna według ACM
\klasyfikacja{% TODO
\begin{itemize}
    \item Theory of computation
    \begin{itemize}
        \item Design and analysis of algorithms
        \begin{itemize}
            \item Data structures design and analysis
        \end{itemize}
    \end{itemize}
\end{itemize}}

\keywords{MSO, query answering, tree automata}

\begin{document}
\maketitle

\begin{abstract}
    We define relabel regular questions on trees, which, via the known
    equivalence between tree automata and MSO formulae on trees, happens to be a
    generalization of the MSO query answering problem on trees. We show these
    questions can be answered in time $O_{\varphi}(m \log(m))$ with respect to
    question size (constant time in the case of MSO formulae with only
    first-order free variables) after preprocessing the input tree in time linear
    with respect to the tree's size. Along the way, we show an algorithm for
    answering questions of the form ``Does the infix of a tree branch between
    nodes $x$ and $y$ belong to the regular language $L$'' (for a previously
    fixed regular language $L$) in constant time after linear preprocessing. Our
    approach is much simpler in presentation than if using deterministic
    factorization forests due to \textcite{colcombet}.
\end{abstract}

\tableofcontents

\chapter{Introduction}

The model checking problem is a classic problem in theoretical computer science
in which we are tasked with answering whether a given structure $S$ satisfies a
sentence $\varphi$ of some logic. Several other interesting computational
problems arise when we generalize to formulae with free variables.

As an analogy to databases, formulae with free variables are often referred to
as \definedterm{queries} in the literature. We consider the \definedterm{query
answering} problem: given a query $\varphi(X_1, ..., X_k)$ and a structure $S$,
we want to answer questions of the form ``does assigning the set of elements
$W_1$ to $X_1$, ..., $W_k$ to $X_k$, satisfy $\varphi$?''.  A naive solution
would be to transform the formula $\varphi$ into a sentence by adding $k$ unary
relations $U_1, \ldots, U_k$ and labeling elements of $W_i$ with $U_i$. Now we
can answer an individual question in the same time as model checking.

Already model checking is known to be computationally difficult for various
interesting logics, but the situation can improve when restricting the class of
structures considered. In this work we will focus on the case of MSO logic over
trees. While MSO model checking is PSpace-hard in general,
\textcite{courcelle1990} showed that deciding if an MSO sentence holds in a
structure of bounded treewidth can be done in time linear in the size of the
structure. This, using the naive method described above, gives us an algorithm
for MSO query answering over trees that answers each question in time
$O_{\varphi}(n)$\footnote{Throughout the text we use $O_{x}(f(n))$ notation,
where $x$ is some portion of an algorithm's input. It indicates $x$ as a
parameter of the algorithm, i.e. that for every integer $k$, there is some
constant $c_k$ such that if $|x| = k$, then the algorithm runs in time $c_k
f(n)$ ($n$ being the size of the remaining input).} ($n$ being the size of the
tree).

In our setting we suppose that we want to answer multiple such questions about a
single tree and formula, varying the valuations we ask about, and thus allow a
preprocessing step over the formula and tree. In this case, the above naive
algorithm could be improved with Amarilli et al.'s \cite{amarilli2019} data
structure that allows updates of the underlying tree.  Let $m$ be the size of a
given question, i.e. $m = |W_1 \cup \ldots \cup W_k|$. Amarilli's result allows
us to preprocess the tree in linear time, label it with a choice of $W_1,
\ldots, W_k$ in time $O_{\varphi}(m \log n)$, then answer model checking in
constant time, thus greatly improving over the naive algorithm for questions
that talk about a small number of vertices compared to the size of the whole
tree.

\section{Main result}

We will improve on the above method even further, showing an algorithm with
which, after linear preprocessing of the tree, we can answer questions in time
linearithmic with respect to the question size. In particular, we eliminate any
dependence on tree size during question answering.

More explicitly, we will show how to solve Problem
\ref{mso-query-answering-problem}, with a preprocessing algorithm working in
time $O_{\varphi}(n)$ (where $n$ is the size of the input tree), and then answer
questions in time $O_{\varphi}(m \log m)$ (where $m$ is the size of the
question):

\queryproblem[%
    an MSO formula $\varphi(\vec{X})$ over trees with $k$ free
    second-order variables.
]{%
    \label{mso-query-answering-problem} MSO Query Answering on Trees
}{%
    a tree $T$.
}{%
    given a $k$-tuple of subsets of $T$'s vertices $\vec{W}
    \in \mathcal{P}(V(T))^{k}$, is $\vec{W}$ a satisfying assignment to
    $\vec{X}$? In other words, does $T \models \varphi(\vec{W})$?
}

Note that though we speak of MSO formulae with second-order variables only,
first-order variables are supported by simply restricting to questions in which
sets assigned to first-order variables are singletons. Also note that for
formulae with first-order free variables only question answering happens in
constant time.

Through a series of reductions, we will show that MSO query answering reduces to
the following problem about tree automata, which we solve in Chapter
\ref{mso-query-answering}:

\queryproblem[%
    a deterministic bottom-up tree automaton $A$ over ranked alphabet $\Sigma$.
]{%
    \label{relabel-regular-queries} Relabel Regular Questions on Trees
}{%
    a tree $T$ labeled with $\Sigma$.
}{%
    given $m$ relabelings $v_1 \mapsto a_1, \ldots, v_m \mapsto a_m$, where
    $v_i$ are vertices of $T$ and $a_i$ are labels from $\Sigma$, what state
    does $A$ arrive  at in the root of $T'$, where $T'$ is $T$ with each $v_i$'s
    label modified to the corresponding $a_i$.
}

\section{Related Work}

Query answering is one generalization of model checking to the case of formulae
with free variables. Another related one is \definedterm{query enumeration}.
Similar to query answering, we again allow the structure and query to be
preprocessed. Then, instead of answering individual questions about valuations,
we are interested in outputting all the satisfying valuations one by one.

The complexity class \constantdelaylin\ refers to those enumeration problems in
which the preprocessing step takes linear time and the delay between each
successive answer is constant. \lineardelaylin\ is a generalization of
\constantdelaylin\ where the time complexity of the second algorithm is linear
in the size of the answer being output. \textcite{bagan2006} introduced this
second notion and showed that enumerating MSO queries is \lineardelaylin\ on
trees. This in particular means that enumerating MSO queries for formulae whose
free variables are all first-order is in \constantdelaylin, and
\textcite{kazana2013} revisit this result, proving it using deterministic
factorization forests due to \textcite{colcombet} rather than Bagan's intricate
indexing structure.

Most closely related, \textcite{kazanaphd} investigated MSO query answering
(there called \emph{query testing}) on structures of bounded treewidth in his
PhD thesis. He showed that after linear preprocessing, questions can be answered
in constant time, however this result was limited to MSO queries whose free
variables are first-order variables (i.e., range over single vertices). This
work generalizes to also allowing set variables in the query, and uses a
different approach. Here again Kazana relied on Colcombet's factorization, an
application of semigroup theory. Our appraoch is more algorithmic and
straigforward.

\section{Organization}

The rest of our work is organized in the following way:

\begin{itemize}
    \item In Chapter \ref{preliminaries} we introduce definitions and
        known algorithms we will use to arrive at our main result.
        \begin{itemize}
            \item In particular, Section \ref{query-answering-problems} serves
                as a short review of some fundamental algorithms for solving
                problems related to trees and regular languages where a
                structure is preprocessed and then questions about it are
                answered.
        \end{itemize}
    \item In Chapter \ref{branchinfix} we solve an important subproblem, which
        is a generalization of a known algorithm about words to the tree case.
    \item In Chapter \ref{mso-query-answering} we arrive at our main result by
        reducing Problem \ref{mso-query-answering-problem} to Problem
        \ref{relabel-regular-queries} and solving it.
        \begin{itemize}
            \item Of note, in Section \ref{computing-closure} we show how to
                compute the LCA closure of a set of $m$ vertices of a
                preprocessed tree in time $O(m \log m)$.
        \end{itemize}
    \item In Chapter \ref{conclusion} we offer some concluding remarks,
        including discussion of time complexities with respect to parameters,
        which we mostly ignore throughout the main text, and a generalization of
        our result to structures of bounded treewidth.
\end{itemize}

\chapter{Preliminaries}\label{preliminaries}

\section{Definitions}
\subsection{Trees}

We work with finite rooted trees whose vertices are labeled with letters from a
finite alphabet. More formally, given a finite alphabet $\Sigma$, for each $a
\in \Sigma$, $a$ is a tree, and if $T_1, \ldots, T_k$ are trees, then $a(T_1,
\ldots, T_k)$ is also a tree. A tree $T$ can then also be seen as an acyclic
undirected graph with vertex set $V(T)$ and edge set $E(T)$, with a
distinguished root vertex $r \in V(T)$.

We use the standard notions of leaf, child, sibling, ancestor, descendant, etc.

Binary trees are trees where each node has either no children (the node is a
leaf), or exactly two children (which, based on their order, can be called the
left and the right child).

\subsubsection{Tree traversals}

Take a binary tree $T$. The \definedterm{post-order} of $V(T)$ is an ordering of
$T$'s vertices produced by the following recursive procedure:

\begin{enumerate}
    \item Traverse the root's children's subtrees.
    \item Visit the root.
\end{enumerate}

Similarly, the \definedterm{pre-order} of $V(T)$ is produced by the following
recursive procedure:

\begin{enumerate}
    \item Visit the root.
    \item Traverse the root's children's subtrees.
\end{enumerate}

Finally, the following procedure produces the \definedterm{in-order} of $V(T)$:

\begin{enumerate}
    \item Traverse the left child's subtree.
    \item Visit the root.
    \item Traverse the right child's subtree.
\end{enumerate}

\subsection{Tree automata}

Consider the case of binary trees labeled with $\Sigma$. A
\definedterm{deterministic, bottom-up tree automaton} (further called just a
\definedterm{tree automaton}) consists of

\begin{itemize}
    \item A finite set of \definedterm{states} $Q$.
    \item A set of \definedterm{accepting states} $F \subseteq Q$.
    \item A bottom-up \definedterm{transition function} $\delta : Q \times
        \Sigma \times Q \to Q$.
    \item An \definedterm{initialization function} $\iota : \Sigma \to Q$.
\end{itemize}

A \definedterm{run} of tree automaton $A$ over tree $T$ is a relabeling of $T$
with elements of $Q$ such that

\begin{itemize}
    \item Each leaf with label $a \in \Sigma$ is relabeled with $\iota(a)$.
    \item If an inner node $v$ has label $a \in \Sigma$, its left child got
        relabeled to $p \in Q$, and its right child got relabeled to $q \in Q$,
        then $v$ gets relabeled with $\delta(p, a, q)$.
\end{itemize}

A run is \definedterm{accepting} if $T$'s root gets relabeled to an accepting
state, that is a state $q \in F$. The set of all binary trees accepted by an
automaton $A$ is called the \definedterm{language recognized by $A$}, notated
$L(A)$. We call the class of all languages recognized by tree automata
\definedterm{regular tree languages}, analogously to regular languages
recognized by finite state automata.

We note that the expressive power of deterministic bottom-up tree automata is
the same as that of nondeterministic (either bottom-up or top-down) tree
automata.

\subsection{Monadic Second Order (MSO) Logic}
% def of MSO

From a logic point of view, the trees we work with can be seen as structures
over a signature with a single binary relation $E$ and unary relations $U_a$ for
each $a \in \Sigma$. $E(v, w)$ represents a (directed from parent to child) edge
from $v$ to $w$. $U_a(v)$ signifies that $v$'s label is $a$.

In the case of binary trees, the relation $E$ is replaced with two binary
relations $E_l$ and $E_r$, which represent an edge from parent to left child,
and from parent to right child, respectively.

For convenience, we will also make use of the binary relation $\leq$, with $v
\leq w$ signifying that $v$ is an ancestor of $w$ (with every vertex being an
ancestor of itself; $<$ can be used to signify a strict ancestor). Note that in
the case of MSO on trees, $\leq$ can be defined using just the edge relation
$E$.

We make use of a fundamental theorem tying MSO logic on trees and tree automata:

\begin{theorem}\label{mso-to-automaton}
    For every MSO formula $\varphi$ over binary trees, there exists a tree
    automaton $A$ such that for every binary tree $T$, $T \models \varphi$ if,
    and only if $T \in L(A)$.
\end{theorem}

We note that the converse of this theorem is also true (i.e. that for every tree
automaton, there is a corresponding MSO formula), however we will use only the
MSO to automata direction in this work. See for example Bojańczyk's text
\cite{bojanczyktoolbox} for a proof of both directions.

\subsection{Computational Model}

We use the Random Access Machine (RAM) model with uniform cost measure. In
particular, basic arithmetic operations are assigned a constant cost, regardless
of bitlength of arguments.

\section{Question answering problems}\label{query-answering-problems}

Consider a computational problem whose inputs are of the form $(S, q) \in
\mathcal{S} \times \mathcal{Q}$ for some set of \definedterm{structures}
$\mathcal{S}$ and some set of \definedterm{questions} $\mathcal{Q}$, and answers
come from a set $\mathcal{A}$. In other words, fix a function $f : \mathcal{S}
\times \mathcal{Q} \to \mathcal{A}$. This induces a \definedterm{question
answering problem} which is divided into two phases:

\begin{description}
    \item[preprocessing] An input structure $S \in \mathcal{S}$ is given and a
        \definedterm{preprocessing algorithm} outputs a data structure
        $S'$ (which can be an arbitrary data structure, not necessarily an
        element of $\mathcal{S}$).
    \item[questions] Given $S'$ and $q \in \mathcal{Q}$, output $f(S, q)$.
\end{description}

We are interested in the time complexities of both phases. We use the following
notation for algorithms that have both a preprocessing and question phase: if
it takes $f(n)$ time to complete the preprocessing step for an input of size
$n$, and $g(n, m)$ time to then handle a question of size $m$, we say the
algorithm has time complexity \qptime{$f(n)$}{$g(n, m)$}.

We turn to a discussion of several question answering problems with known
solutions, both to serve as examples and because we will be using them in our
algorithm.

\subsection{Lowest Common Ancestor}

\queryproblem{%
    Lowest Common Ancestor (LCA)
}{%
    a tree $T$.
}{%
    given vertices $x$ and $y$, find the vertex $z$ that's an ancestor of both
    $x$ and $y$, and is their lowest (i.e. furthest from the root) common
    ancestor.
}

\textcite{tarjan1984} were the first to show an optimal \qpoptimal{}
algorithm for LCA. \textcite{schieber1988} used a similar approach but
simplified the indexing structure, keeping the same time complexities.
\textcite{berkman1993} showed a completely new approach to the problem, which
relies on answering range minimum queries (see below) about an array of the
tree's vertices arranged in a specific order. \textcite{bender2000} offer a
simpler presentation of the algorithm in \cite{berkman1993} and note the
equivalence between the LCA and RMQ problems.

\subsection{Range Minimum Query (RMQ)}

\queryproblem{Range Minimum Queries}{%
    an array $A$ of integers.
}{%
    given indices $i$ and $j$, return the index of the smallest
    element in the subarray $A[i, j]$.
}

\textcite{bender2000} show an \qpoptimal{} algorithm for the RMQ problem.

Their method is as follows. First they show that a special case of the RMQ
problem, $\pm1$ RMQ, can be solved in \qpoptimal. This restriction of the problem
is enough to handle LCA questions. Then, for the general RMQ case, a Cartesian
tree\footnote{A Cartesian tree of an array is a binary tree with the array's
minimum element in its root, the root's children being Cartesian trees of the
left and right subarrays around the minimal element. It can be constructed in
linear time.} of the array is built, and LCA questions on this tree correspond
to range minimum queries on the array.

\subsection{Word infix regular questions}\label{wordinfix}

We now turn to a problem about regular languages. Note that the regular language
is fixed, i.e. we treat the size of its representation (e.g. the size of an
automaton representing it) as a parameter. Below we take a deterministic
automaton for convenience, but if we instead start with a nondeterministic
automaton we ``don't care'' about the exponential cost of determinizing it.

\queryproblem[%
    regular language $L$ over alphabet $\Sigma$, given by DFA $A$.
]{Word Infix Regular Questions}{%
    a word $w \in \Sigma^*$.
}{%
    given indices $i$ and $j$ with $1 \leq i < j \leq |w|$, does the infix $w[i,
    j]$ belong to $L$?
}

Let $n$ be the length of $w$. This problem has a very elegant \qpoptimal{}
solution, as presented by \textcite{bojanczyk}. We present the full construction
as we will be generalizing its internals for the tree case in Chapter
\ref{branchinfix}.

\textbf{preprocessing} We begin by creating a graph whose vertices are $n+1$
copies of the set of states of $A$, $Q$.  More precisely, we will be working
with a graph whose vertex set is $\{0, 1, \ldots, |w|\} \times Q$. We will refer
to a vertex in the $i$th copy of $Q$ labeled with state $q$ as $i.q$.

Since $A$ is deterministic, each letter $a \in \Sigma$ can be seen as a function
$a : Q \to Q$, and we draw these functions as directed edges between successive
copies of $Q$. For example, suppose the $i$th letter of $w$ is $a$. If $A$ in
state $q$, reading $a$, would move to state $q'$, then there will be an edge
from $(i-1).q$ to $i.q'$. We can think of this construction as first drawing
copies of $Q$ around each letter of $w$, then replacing each letter with the
edges induced by $A$'s transition function.

Note that by determinism, each vertex has exactly one outgoing edge (except for
vertices in the last copy of $Q$ which have no outgoing edges at all).

Now we will color the vertices of the graph we just constructed with the colors
$1, 2, \ldots, |Q|$ in such a way that

\begin{enumerate}
    \item every copy of $Q$ has one vertex of each of the $|Q|$ colors;
    \item when $i.q$ has color $c$ and there is an edge to $(i+1).q'$, which is
        colored with color $c'$, then $c \geq c'$.
\end{enumerate}

The second point basically means that we will be trying to draw single-color
paths, but when paths need to join, it is the higher-colored path that gets cut
off (think of the colors as priorities, with lower numbers representing more
important priorities; see Figure \ref{word-figure}).

The construction is as follows:

\begin{enumerate}
    \item Color an arbitrary vertex of the first copy of $Q$ with the color $1$.
    \item Follow the deterministic edges to the end of the word, coloring all
        vertices along this path with $1$.
    \item Now color another uncolored vertex of the first copy of $Q$ with the
        color $2$.
    \item Try following edges as far as possible, coloring all visited vertices
        with $2$.
    \item If your run into an already colored vertex, pick an arbitrary
        uncolored vertex in this copy of $Q$ to color with $2$ and continue from
        here.
    \item Repeat steps 3.-5. for each successive color up to $|Q|$.
\end{enumerate}

\begin{figure}
    \centering
    \def\svgwidth{\columnwidth}
    \input{figures/word.pdf_tex}
    \caption{
        The data structure for infix regular questions over word $abaacbc$ and a
        simple automaton with 4 states. As shown by the color blotches below the
        graph, red is color 1, with the highest priority, green is color 2 with
        the next priority, pink is color 3, blue is color 4, with the lowest
        priority.
        The information from $\breaktable$ is not shown in the figure, but it
        amounts to a pointer from each vertex to the rightmost vertex on its
        connected single color component.
    }\label{word-figure}
\end{figure}

Additionally, for each vertex $i.q$, we store the index of the next copy of $Q$
in which the path of this vertex's color is broken by a lower color. Put this
information in table $\breaktable$. For example, if $i.q$ is colored with color
$c$, and when following the deterministic path from $i.q$, all encountered
vertices are colored $c$ until $j.q'$ which is colored $c'$, then we set
$\breaktable[i.q] = j - 1$. See Figure \ref{word-figure} for a visual
representation of the data structure described for an example word and
automaton.

We can compute $\breaktable$ in linear time with a single backwards pass through
the colored graph.

\textbf{answering questions} Consider a question ``does $w[i, j] \in L$''. We
claim that using our colored graph and the table $\breaktable$ we can, in
constant time, conclude in what state the automaton $A$ will end up in on the
$j$th position of $w$ if it had started in its initial state on the $i$th
position.

First, look at vertex $(i-1).q_0$, which is the vertex of $A$'s initial state in
the $(i-1)$th copy of $Q$ and note its color $c$. Now we want to answer the
following question: if we follow the edges of the graph until the $j$th copy of
$Q$, what color will we end in? First, look at $k := \breaktable[(i-1).q_0]$. If
$k \geq j$, then we know that in the $j$th copy of $Q$, the path we're
interested in still has color $c$. Find the unique vertex $j.q$ in this copy of
$Q$ that's colored with $c$. $q$ is the state we will end up in, so if it is an
accepting state answer YES, if not, answer NO.

If $k < j$, then jump to the $k$th copy of $Q$ and consider the vertex $k.q$,
the unique vertex here colored $c$. From $k.q$, follow its single outgoing edge
to $(k+1).q'$, which will be colored with color $c' < c$. Continue as we did
before, by looking at $k' := \breaktable[(k+1).q']$, comparing it to $j$, and
either halting if $k' \geq j$, or jumping again otherwise. Because with each
jump we move to a color strictly smaller than before, the number of jumps is
bounded by the number of colors, $|Q|$. Thus the question is answered in time
$O(|Q|)$, which is constant with respect to $|w|$.

\chapter{Branch Infix Regular Questions}\label{branchinfix}

Before solving our main problem, that of MSO query answering on trees, we
generalize word infix regular questions (Section \ref{wordinfix}) to trees. This
will be a vital step in the MSO query algorithm, but is an interesting result on
its own.

The question answering problem we will solve is:

\queryproblem[%
    regular language $L$ over alphabet $\Sigma$, given by DFA $A$.
]{\label{branch-infix-problem} Branch Infix Regular Questions}{%
    a $\Sigma$-labeled tree $T$.
}{%
    given a vertex $x$ and its descendant $y$, does the word given by labels on
    the path from $x$ to $y$ belong to $L$?
}

Let $n$ be the size of $V(T)$.

\begin{theorem}\label{branch-infix-theorem}
    Problem \ref{branch-infix-problem} can be solved in time
    \qptime{$O_{A}(n)$}{$O_{A}(1)$}.
\end{theorem}

We begin with a similar construction as in the word case, i.e. we create a graph
of $n+1$ copies of $A$'s states $Q$. The vertex set of the graph will be $(V(T)
\cup \{ \mathsf{root} \}) \times Q \}$ and we can refer to a specific vertex as
$x.q$ for $x \in V(T) \cup \{ \mathsf{root} \}, q \in Q$.

Again, each letter $a \in \Sigma$ defines a function $a : Q \to Q$, and these
functions induce edges in our ``fattened'' tree: consider a vertex $x$ of $T$,
labeled with letter $a$ and let $y$ be its parent (or the new vertex
$\mathsf{root}$ when $x$ is the root of $T$). If $A$ in state $q$, reading
letter $a$ goes to $q'$, then there will be an edge from $y.q$ to $x.q'$. In the
word case we had placed copies of $Q$ around each letter and replaced the
letters with edges implied by $A$'s transition function, here we perform an
analogous transformation on a tree shaped input.

We also color all vertices with colors $1, \ldots, |Q|$ in this graph,
analogously to how we did in the word case: begin by coloring an arbitrary
vertex in the root with $1$, then follow edges downwards, coloring all visited
vertices with $1$. Then begin the same process with the next color, restarting
with a different vertex in a given copy of $Q$ if we run into an already colored
vertex.

This process will again lead to a coloring with the desired properties that

\begin{enumerate}
    \item every copy of $Q$ has one vertex of each of the $|Q|$ colors;
    \item when a vertex of color $c$ has an edge to a vertex of color $c'$ in a
        child copy of $Q$, then $c \geq c'$.
\end{enumerate}

\begin{figure}
    \centering
    \def\svgwidth{\columnwidth}
    \begin{subfigure}{0.3\linewidth}
        \input{figures/tree.pdf_tex}
        \caption{The original tree $T$}
    \end{subfigure}
    \def\svgwidth{\columnwidth}
    \begin{subfigure}{0.4\linewidth}
        \input{figures/tree-graph.pdf_tex}
        \caption{The colored graph}
    \end{subfigure}
    \caption{
        On the left, a sample labeled tree. On the right, the colored graph we
        create for the purposes of answering branch infix regular questions for
        this tree and a simple three state automaton. Note how the copies of the
        automaton's states are arranged in the same as shape as $T$, with an
        additional new root copy. The letters of the original tree fit in the
        spaces between copies of $Q$, inducing the edges between them.
    }\label{tree-figure}
\end{figure}

See Figure \ref{tree-figure} for how the colored graph is constructed for an
example tree.

Now let's consider how we could answer the question ``is the branch infix from
$x$ down to $y$ in $L$?''. Here we can't proceed exactly as in the word case. We
don't have a $\breaktable$ table since a vertex in an internal node can have
arbitrarily many points below it where its color is broken by a lower one.
Ideally we would like to be able to find such a color break point that is ``in
the direction of $y$ from $x$''.

In the next section we formulate this question formally and solve it as a
subproblem.

\section{Highest Marked Descendant on Path}

Consider the following question answering problem:

\queryproblem{\label{highest-marked-problem} Highest Marked Descendant on Path Questions}{%
    a tree $T$ with set $M \subseteq V(T)$ of marked vertices.
}{%
    given a vertex $x$, its descendant $y$, find the node $z \in M$ that
    is the highest marked node on the path between $x$ and $y$, if such $z$
    exists.
}

\begin{theorem}\label{highest-marked-theorem}
    Problem \ref{highest-marked-problem} can be solved in time
    \qptime{$O(n)$}{$O(1)$}.
\end{theorem}

We will build an index structure, constructible in linear time, that allows us
to handle such questions in constant time. The structure is heavily inspired by
\textcite{bender2000}, where a simple algorithm for answering LCA questions is
presented.

First, we create the array $\posttable$ of length $n$, which is the post-order
of the nodes.

Next, we label each node of the tree with its pre-order number. We create the
array $\prefixtable$ with the corresponding pre-order labels of the nodes in
$\posttable$, i.e.  if $\posttable[i] = v$, then $\prefixtable[i]$ is $v$'s
pre-order number.

Finally, for each node of the tree, we record its index in $\posttable$ in the
array $\indextable$.

\begin{observation}
    Given a node $x$ and its descendant $y$, looking at the range
    $\prefixtable[\indextable[y], \indextable[x] - 1]$:
    \begin{enumerate}
        \item All the values in this range correspond to descendants of $x$.
            \label{obs-descendants}
        \item The values smaller than $\prefixtable[\indextable[y]]$ correspond
            to ancestors of $y$. \label{obs-ancestors}
        \item In particular, the minimum of this range corresponds to the
            highest ancestor of $y$ that is also a descendant of $x$.
            \label{obs-conclusion}
    \end{enumerate}
\end{observation}

Proof: The pre-order labels in $\prefixtable$ are ordered according to a
post-order. In a post-order, the root of a subtree is visited directly after all
its descendants. Since $y$ is a descendant of $x$, all values between its
position in $\prefixtable$ and $x$'s position there will also be descendants of
$x$, giving item \ref{obs-descendants}.

The values in $\prefixtable$ are pre-order numbers. In a pre-order, for a given
node $z$, the only nodes that will have lower pre-order numbers are $z$'s
ancestors, and the children of these ancestors that are to the left of the child
towards $z$. By starting our range at index $\indextable[y]$, we've skipped over
all of the second types of vertices with lower pre-order numbers, leaving only
$y$'s ancestors as nodes in the range with a lower pre-order number. This gives
us item \ref{obs-ancestors}.

Items \ref{obs-descendants} and \ref{obs-ancestors} together give us item
\ref{obs-conclusion}.

In our problem, we only care about ancestors of $y$ that are marked. So
we perform one final modification of our data structure: for all non-marked
vertices $v$, we change $\prefixtable[\indextable[v]]$ to $\infty$ (which can be
represented by $n+1$, an integer greater than any node's pre-order label). With
$\prefixtable$ modified like this, we observe that now the minimum value of
$\prefixtable[\indextable[y], \indextable[x] - 1]$ corresponds exactly to the
answer of our question -- the highest marked node between $x$ and $y$.

We preprocess $\prefixtable$ for RMQ in linear time.

Now when given a question ``what is the highest marked descendant of $x$ in the
direction of $y$'', we:

\begin{enumerate}
    \item Look up $i := \indextable[y]$ and $j := \indextable[x] - 1$.
    \item Perform an RMQ lookup on $\prefixtable[i, j]$, giving us index $k$ of
        the minimal value in that range. \label{hmd-algo-lookup}
    \item Look up the corresponding vertex as $z := \posttable[k]$. This is the
        answer to our question.
\end{enumerate}

We can detect the siutation of there not being a marked node between $x$ and $y$
during step \ref{hmd-algo-lookup} by checking if $\prefixtable[k]$ is smaller
than the pre-order label of $y$. If it isn't, then all the nodes on the path
from $x$ to $y$ were unmarked (had higher values in the modified $\prefixtable$
since values at indices corresponding to unmarked nodes are set to $\infty$),
and $k$ will correspond to a marked node that's not on the path from $x$ to $y$
(or some unmarked vertex if there were no marked vertices at all in the range we
check).

\begin{figure}
    \centering
    \def\svgwidth{0.5\columnwidth}
    \input{figures/tree-marked.pdf_tex}
    \caption{
        A preprocessed marked tree. The marked vertices are colored dark grey.
        To the left of each vertex is its pre-order number. To the right is its
        post-order number. On the bottom is the table $\prefixtable$ with only
        the marked vertices' prefix numbers not replaced by $\infty$. The red
        rectangle marks the range for which we issue an RMQ question when
        answering a highest marked descendant question form the vertex with
        post-order index 7 down to the one with post-order index 2.
    }\label{marked-figure}
\end{figure}

See Figure \ref{marked-figure} for a visual example of the data structure
described.

Constructing each of $\posttable$, $\prefixtable$, and $\indextable$ takes linear
time, RMQ is a \qptime{$O(n)$}{$O(1)$} problem, so we have proved Therom
\ref{highest-marked-theorem}.

\section{Answering branch infix regular questions}

With the above problem solved, we are ready to finish our algorithm for branch
infix regular questions. Recall that we have created a graph of copies of $Q$
shaped similar to $T$ where edges between vertices correspond to transitions of
$A$ along corresponding vertices of $T$. During preprocessing we additionally
need to remember in each vertex $x.q$ a unique identifier of the connected
component of vertices of the same color that it is in (the identifiers can be
the root vertices of each such component), call this data
$\componenttable[x.q]$.

For each color $c$, we will preprocess the above graph for answering questions
of the form ``given a vertex $z.q$ colored with $c$, which is the first copy of
$Q$ on the path towards the copy of $Q$ that contains $y$ where the path from
$z.q$ changes color?''. To achieve this, we create a copy of $T$ with a newly
added root (thus this tree's vertices correspond to copies of $Q$ in our
fattened tree) in which we mark the vertices corresponding to copies of $Q$
where a non-$c$-colored vertex is pointed to by an edge from a $c$-colored
vertex in the parent copy of $Q$. This tree we preprocess for highest marked
descendant on path questions, solved in the previous section.  Note that Figure
\ref{marked-figure} is exactly the marked tree we create and preprocess for the
green color of the colored graph in Figure \ref{tree-figure}.

Now question answering can proceed similar to the word case. Given a question
``does the word on vertices from $x$ down to $y$ belong to $L$?'', let $x'$ be
$x$'s parent in $T$ (or $\mathsf{root}$ if $x$ was $T$'s root), look at the
vertex $x'.q_0$ and consider its color $c$. In the copy of $Q$ corresponding to
$y$ consider the vertex $y.q$, the unique vertex here of color $c$. If it is in
the same connected component as $x'.q_0$ (which we check by comparing
$\componenttable[x'.q_0]$ with $\componenttable[y.q]$), we can immediately
answer whether or not $A$ will accept the word on this path based on whether
this state is accepting. Indeed, if the two vertices are in the same component
then there is a single-color path from $x'.q_0$ down to $y.q$ and $q$ is indeed
the state $A$ would have ended in after running on the word given by the labels
from $x$ down to $y$.

Otherwise, we need to jump down to a lower copy of $Q$. We can find the first
such copy where the color on our path changes from $c$ to something lower by
asking a highest marked descendant on path question on the marked tree we
created for color $c$. The answer to this question exactly corresponds to the
point where the path of color $c$ from our initial state merges into a lower
color $c'$ on the path towards $y$. From here we continue as at the beginning,
checking $\componenttable$ to see if we can answer immediately, or taking
another jump down. Handling each jump takes constant time, and the number of
jumps is bounded by $|Q|$ since we can only jump to a lower color. Thus we can
answer branch infix regular questions in time constant with respect to $|T|$.

This finishes the proof of Theorem \ref{branch-infix-theorem}.

\chapter{MSO Query Answering on Trees}\label{mso-query-answering}

Now we have all the pieces necessary to present our main result, which we
can formulate as Theorem \ref{mso-query-answering-theorem}:

\begin{theorem}\label{mso-query-answering-theorem}
    Problem \ref{mso-query-answering-problem} can be solved in time
    \qptime{$O_{\varphi}(n)$}{$O_{\varphi}(m \log m)$}.
\end{theorem}

First, let's reduce Problem \ref{mso-query-answering-problem} to Problem
\ref{relabel-regular-queries}:

\begin{lemma}
    If Problem \ref{relabel-regular-queries} has an \qptime{$O_A(n)$}{$O_A(m
    \log m)$} solution, then Problem \ref{mso-query-answering-problem} has an
    \qptime{$O_{\varphi}(n)$}{$O_{\varphi}(m \log m)$} one.
\end{lemma}

Proof: We proceed with several standard linear reductions that transform Problem
\ref{mso-query-answering-problem}, which speaks about MSO formulae with free
variables, to Problem \ref{relabel-regular-queries}, which talks about tree
automata over binary trees.

First of all, if $T$ is not binary, we can use a standard encoding to encode it
inside of a binary tree $T'$, modifying $\varphi$ to $\varphi'$, such that $T
\models \varphi(\vec{W})$ if and only if $T' \models \varphi'(\vec{W})$.

Now, to use Theorem \ref{mso-to-automaton}, we turn the formula into an MSO
sentence without free variables by adding a unary relation $U_X$ for each
variable $X \in \vec{X}$ and instead of considering the formula $\varphi'$, we
now consider the sentence

\begin{equation*}
    \varphi'' = \exists_{X_1} \ldots \exists_{X_k}
        (\forall{}_x x \in X_i \iff U_{X_i}(x)) \land
        \varphi'(X_1, \ldots, X_k).
\end{equation*}

Selecting a model in which the vertices in set $W$ are exactly those colored
with unary relation $U_X$ is equivalent to a valuation of $\vec{X}$ in which the
variable $X$ is set to $W$.

Now by Theorem \ref{mso-to-automaton}, there is a tree automaton $A$ for binary
trees over the alphabet $\Sigma \times \{ 0, 1 \}^{\vec{X}}$ which accepts a
tree if and only if its labeling corresponds to a model satisfying $\varphi''$.
Here a label $b_1 \ldots b_k$ (where $k = |\vec{X}|$ and each $b_i \in \{ 0, 1
\}$) should be interpreted as a bit vector signifying which unary relations
$U_X$ a vertex is assigned to.

By combining the above reductions, we can solve MSO query answering on trees by
solving the relabel regular qeustions problem on trees. Indeed, fix a formula
$\varphi(\vec{X})$ and take a tree $T$. Derive automaton $A$ and tree $T'$ as
above. Label each vertex of $T'$ with $0\ldots0$ (signifying that none of its
vertices have yet been assigned to any of the variables in $\vec{X}$).
Preprocess $A$ and $T'$ for relabel regular questions. Now a relabeling of
$T'$'s vertices corresponds to selecting a specific valuation $\vec{W}$ of
$\vec{X}$, thus answering whether or not $A$ accepts the relabeled tree is
equivalent to answering whether or not $T \models \varphi(\vec{W})$.

This concludes the proof of the lemma, and Theorem
\ref{mso-query-answering-theorem} will be proved if we prove:

\begin{theorem}\label{relabel-regular-queries-theorem}
    Problem \ref{relabel-regular-queries} can be solved in time
    \qptime{$O_A(n)$}{$O_A(m \log m)$}. Furthermore, time
    complexity with respect to the size of the tree automaton is exponential in
    both the preprocessing and question answering phases.
\end{theorem}

The rest of this chapter is dedicated to proving Theorem
\ref{relabel-regular-queries-theorem}.

\section{Relabel Regular Questions on Trees}

Fix a tree automaton $A$ and take a $\Sigma$-labeled binary tree $T$.

We'll preprocess the tree in such a way that when given a relabel question,
we'll be able to partition the tree into linearly (with respect to the
question's size) many parts, then, in a bottom-up fashion, compute the state in
the root of each part in the relabeled tree. The computation for each part will
take constant time.

Partitioning into the $O(m)$ parts we're interested in will take time $O(m \log
m)$, then after the bottom-up computation we will arrive at the root's state in
time $O_A(m)$, thus answering the question in the promised time.

\subsection{LCA partition}

Consider a set of tree vertices $W \subseteq V(T)$. We'll say that $W$ is
\definedterm{LCA closed} if for every pair of vertices $v, w \in W$, it is the
case that $\mathlca(v, w) \in W$ (note that one of $v$ or $w$ might be their
LCA).

$W \subseteq V(T)$ is \definedterm{partition ready} if

\begin{enumerate}
    \item it is LCA closed;
    \item it contains the root of the tree;
    \item\label{both-subtrees-property} for every $v \in W$, either it has
        descendants in $W$ in at most one of its subtrees, or both of its
        children are elements of $W$.
\end{enumerate}

We will define the \definedterm{LCA partition with respect to $W$} for a
partition ready subset $W$ of $T$'s vertices. The partition will assign each
vertex of $T$ to a unique part, and each part will be rooted in an element of
$W$ (in particular, there will be as many parts as elements in $W$).

Each part will be of one of three types:

\begin{description}
    \item[subtree] A \definedterm{subtree of $v$} is $v$ along with all its descendants;
        it is rooted in $v$.
    \item[subtree with hole] A \definedterm{subtree of $v$ with hole $w$} is the
        subtree of $v$ minus the subtree of $w$ (for $w$ a descendant of $v$);
        it is rooted in $v$.
    \item[singleton] A singleton vertex $v$, which is the root of such a part.
\end{description}


Now, for a partition ready set $W \subseteq V(T)$, the LCA partition of $T$ with
respect to $W$ is a partition into subtrees, subtrees with holes, and
singletons constructed according to the following rules:

\begin{enumerate}
    \item If $v \in W$ is maximal in $W$ with respect to the ancestor relation
        $<$ (i.e. there are no other elements of $W$ in the subtree of $v$),
        then the subtree of $v$ is a part of the partition.
    \item If $v \in W$ has descendants that are in $W$ only in its left child's
        subtree, let $w$ be the highest such descendant (there is a unique
        highest descendant because $W$ is LCA closed). The subtree of $v$ with
        hole $w$ is a part of the partition.
    \item Symmetrically if $v \in W$ only has descendants that are in $W$ in its
        right child's subtree.
    \item If $v \in W$ has descendants that are in $W$ in both its children's
        subtrees we know that $v$'s children are also elements of $W$ by
        property \ref{both-subtrees-property} of being partition ready. Put $v$
        into a singleton part on its own, all the other elements of its subtree
        were assigned to parts when its children were handled.
\end{enumerate}

\begin{figure}
    \centering
    \def\svgwidth{\columnwidth}
    \input{figures/partition.pdf_tex}
    \caption{
        An LCA-partition of the tree with respect to the black vertices. The
        dark grey vertices are added when computing the LCA closure, the light
        grey vertices are the final vertices needed to create a minimal
        partition ready set containing the black vertices. Part (a) is an
        example of a singleton, part (b) of a subtree with a hole, and (c) of a
        subtree.
    }\label{partition-figure}
\end{figure}

If $W$ is not parition ready, we will define the LCA partition with respect to
$W$ the partition with respect to the minimal partition ready set that contains
$W$. See Figure \ref{partition-figure} for an example of a partitioned tree.
Skipping ahead, the LCA partition of the set of vertices relabeled in a
relabeling question will be the partition we alluded to at the end of the
previous section. We promised that this partition will have $O(m)$ parts, so we
must show that for a set $W$, the minimal partition ready set containing it is
of size $O(|W|)$.

Take a set of $m$ tree vertices $W$ and consider the following procedure:

\begin{enumerate}
    \item Let $W'$ be $W$ with the tree's root added.
    \item\label{partition-ready-lca-step} Let $W''$ be the LCA closure of $W'$.
    \item\label{partition-ready-last-step} Let $W'''$ be $W''$ plus, for every
        element of $W''$ that has elements of $W''$ in both its children's
        subtrees, the element's both children.
\end{enumerate}

\begin{claim}
    The above procedure produces the minimal parition ready set containing $W$
    and each step adds at most $O(m)$ new vertices.
\end{claim}

By defintition of LCA closure, $W''$ above is the smallest LCA closed set
containing $W$ and the tree's root. Step \ref{partition-ready-last-step} adds at
most $2|W''|$ new vertices, we will now prove that these are exactly the only
vertices still needed to be added to arrive at a partition ready set. For now
assume that only $O(m)$ vertices were added in step
\ref{partition-ready-lca-step}, this will be proved in the next section.

It's easy to see that $W'''$ is still LCA closed. It is obvious that it is
necessary to add at least all the vertices added in step
\ref{partition-ready-last-step}, otherwise property \ref{both-subtrees-property}
of partition readiness won't be satisfied. What we will show is that no other
new vertices need to be added. Take $v \in W''$ that has descendants from $W''$
in both its subtrees, let $v_l$ and $v_r$ be its left and right child,
respectively. Without loss of generality, suppose $v_l$ is not an element of
$W''$. The only reason we would need to add even more vertices after adding
$v_l$ would be if $v_l$ had descendants in both its children's subtrees in
$W'''$. First let's show that it has descendants in $W''$ in at most one of its
subtrees. Indeed, if it did have descendants from $W''$ in both its subtrees,
$W''$ wouldn't have been LCA closed -- $v_l$ would have been the LCA of the
topmost members of $W''$ of its children's subtrees. Newly added elements of
$W'''$ can appear only underneath vertices that were originally in $W''$, so if
$v_l$ didn't have an element of $W''$ in one of its subtrees, then it also
doesn't have elements of $W'''$ in that subtree. Thus $W'''$ is indeed the
minimal partition ready set containing $W$.

\subsection{Computing the LCA closure}\label{computing-closure}

\begin{lemma}\label{lca-lemma}
    After linear preprocessing of a tree $T$, the LCA closure of a set of $m$
    tree vertices $W \subseteq V(T)$ can be computed in time $O(m \log m)$, and
    the size of the closure is linear in the size of $W$.
\end{lemma}

Proof: In the preprocessing step, we will preprocess the tree for LCA questions,
and assign each vertex $v$ its in-order number, $\infixtable[v]$.

Now to compute the closure of $W$, we first sort the vertices in $W$ with
respect to their in-order numbers, so we end up with a list $v_1, \ldots, v_m$
of vertices such that $\infixtable[v_1] < \ldots < \infixtable[v_m]$.

\begin{observation}\label{infix-observation}
    If $\infixtable[u] < \infixtable[v] < \infixtable[w]$, then $\mathlca(u, w)$
    is equal to either $\mathlca(u, v)$ or $\mathlca(v, w)$.
\end{observation}

This may be proved by a simple case analysis.

\begin{observation}\label{lca-observation}
    Consider two tree vertices, $u, w \in V(T)$ with $\infixtable[u] <
    \infixtable [w]$, and their least common ancestor
    $v$. Then $\infixtable[u] \leq \infixtable[v] \leq \infixtable[w]$.
\end{observation}

Proof: If $v$ is equal to one of $u$ or $w$, the observation is obvious. For
example, if $v = u$, we have that $\infixtable[u] = \infixtable[v] <
\infixtable[w]$. If $v$ is a third vertex, then $u$ must be in its left subtree
and $w$ must be in its right subtree. Thus an infix walk over $T$ will first
visit $u$, then $v$, and then $w$.

Now, once $W$ has been sorted by in-order numbers (which takes time $O(m \log
m)$ using a standard sorting algorithm), we can complete our computation in time
$O(m)$. Consider the following set of vertices:

\begin{equation*}
    U := \{ v \;|\; v = LCA(v_i, v_{i+1}) \text{ for $1 \leq i \leq m - 1$} \}
\end{equation*}

$U$ is of size $O(m)$ and can be computed in time $O(m)$ by issuing LCA
questions to successive pairs of the sorted $W$. We claim that $W' := W \cup U$
is the LCA closure of $W$.

Let's define $u_i := LCA(v_i, v_{i+1})$. By Observation
\ref{lca-observation}, each $u_i$ falls between $v_i$ and $v_{i+1}$ in the infix
order, i.e. $W'$ sorted by infix numbers is the sequence $v_1, u_1, v_2, u_2,
\ldots, u_{m - 1}, v_m$ (note that in the way this sequence is written, some
vertices might be repeated, e.g. it might be the case that $LCA(v_2, v_3) =
v_2$, in which case $u_2$ will be equal to $v_2$). We claim that for every pair
of vertices in $W'$, their LCA is also in $W'$. We'll proceed by induction, and
it will be easier if we now rename the sorted $W'$ to $w_1 := v_1, w_2 := u_2,
\ldots, w_{2m-2} := u_{m-1}, w_{2m - 1} := v_m$. As quick recap, the following
are properties of the sequence of $w_i$'s:

\begin{enumerate}
    \item For $i < j$, $\infixtable[w_i] \leq \infixtable[w_j]$.
    \item For odd $i$, $w_i$ comes from the original set $W$.
    \item For even $i$, $w_i$ comes from the set of added vertices $U$. In
        particular, $w_i$ is equal to $LCA(w_{i-1}, w_{i+1})$.
\end{enumerate}

We prove that $W'$ is LCA closed by showing that for all pairs $w_i, w_j$, their
LCA belongs to $W'$, by induction on the difference between $i$ and $j$.

The base case is to show that the LCA of $w_i$ and $w_{i+1}$ is in $W'$ for
every pair of successive vertices. Suppose $i$ is odd, then $w_{i+1}$ comes from
$U$ and was constructed as the LCA of $w_i$ and $w_{i+2}$. Then $w_{i+1}$ must
be on the path from $w_i$ to the root and is the LCA of the pair. Similarly for
when $i$ is even.

Now assume that for all $w_i$, $w_j$, if $j - i < k$ then $LCA(w_i, w_j)$ is in
$W'$. We want to show the same is true when $j - i \leq k$. Take $w_i$ and $w_j$
with $j - i = k$. Consider the triple $w_i$, $w_{i+1}$, $w_j$. By Observation
\ref{infix-observation}, $LCA(w_i, w_j)$ is equal to $LCA(w_i, w_{i+1})$ or
$LCA(w_{i+1}, w_j)$. By the induction assumption, we already know both of these
are in $W'$, thus showing $W'$ is LCA closed and finishing the proof of Lemma
\ref{lca-lemma}

\section{Computing root states of parts}

Let's now describe how we will use LCA paritions to solve the relabel questions
problem. Our approach is the following: given the question ``is $T$ accepted by
$A$ after relabeling according to $v_1 \mapsto a_1, \ldots, v_m \mapsto a_m$?'',
let $W = \{ v_1, \ldots, v_m \}$. Consider the partition of $T$ with respect to
$W$. We will show how to compute $A$'s state in the root of each part of the
partition, after the tree had been relabeled. Per our definition of the
partition, we have three types of parts to consider:

\begin{enumerate}
    \item A subtree rooted at vertex $v$.
    \item A singleton vertex $v$.
    \item A subtree rooted at vertex $v$ with hole $w$.
\end{enumerate}

The first case is trivial to compute: no descendants of $v$ had been relabeled,
so the states of the children of $v$ are the same as in $A$'s run on the
original tree $T$. We look them up in the precomputed run, then apply $A$'s
transition to those states and $v$'s new label.

We will compute the states of part roots in a bottom-up fashion, so the second
case is also simple. Both of $v$'s children are roots of parts (since the LCA
partition is based on a partition ready set), so once we've computed the states
in the children of $v$, we again simply apply $A$'s transition function to those
states and $v$'s new label.

The third is the interesting case, that of a subtree with a hole. Consider the
path from the hole $w$ up to the root $v$. What we claim is that the question
``if $A$'s state in $w$ is $q$, is the state in $v$ going to be $q'$?'' can be
answered with a branch infix regular question.

\subsection{Computing the root state of a subtree with a hole}

Consider the subtree of $v$ with hole $w$. It can be divided into the following
components:

\begin{itemize}
    \item A path from the parent of $w$ up to $v$, notate it as $u_1, u_2,
        \ldots, u_p = v$.
    \item For each $i$ from 1 to $p$, the subtree of $u_i$'s child that doesn't
        lie on the path from $w$ to $v$. Let $u_i'$ be the child of $u_i$ that
        doesn't lie on the $w$ to $v$ path.
\end{itemize}

\begin{figure}
    \centering
    \def\svgwidth{0.5\columnwidth}
    \input{figures/subtree-with-hole.pdf_tex}
    \caption{
        The subtree of $v$ with hole $w$. All its nodes can be partitioned into
        the path $u_1, \ldots, u_4 = v$ and the subtrees of $u_1', \ldots,
        u_4'$.
    }\label{subtree-with-hole-figure}
\end{figure}

The node $w$ is not a part of the subtree with hole, but we will need to
consider it during the following computation, so for ease of notation we will
define $u_0 := w$, extending the path of $u_i$'s.

Figure \ref{subtree-with-hole-figure} depicts a subtree with hole labeled
according to this nomenclature.

With our bottom-up computation, we've already computed the new state in $w$.
There have been no relabelings in the subtrees of the $u_i'$'s, so we can get
their states from the precomputed run of $A$ on the original tree. Now consider
how, using all this information, we could easily compute $v$'s new state.

To compute the state in $u_1$, all we need is $u_1$'s label and the states of
$u_0$ and $u_1'$ -- $u_1$'s children. This is all information we already have.
So going up the path from $w$ to $y$, we can compute each $u_i$'s new state
from its label and the states of $u_{i-1}$ and $u_i'$. The computation as just
described would take time linear in $p$, the length of the path, but the
computation is simple enough that it could be performed by a finite state word
automaton, allowing us to use our result from Chapter \ref{branchinfix}. To show
this formally, consider the alphabet $\Sigma' := Q \times \Sigma \times Q \times
\{ \leftsymbol, \rightsymbol \}$. A word over this alphabet can be used to
represent a subtree with a hole. A subtree with hole as described above would be
represented by word $\alpha_0 \alpha_1 \ldots \alpha_p$ with each $\alpha_i$
being a quadruple $\langle p_i, a_i, q_i, d_i \rangle \in \Sigma'$ (with $p_i,
q_i \in Q$, $a_i \in \Sigma$, $d_i$ either $\leftsymbol$ or $\rightsymbol$)
where:

\begin{itemize}
    \item $a_i$ is the original label of $u_i$.
    \item $p_i$ and $q_i$ are the states of $u_i$'s children in the precomputed
        run of $A$ over $T$.
    \item $d_i$ encodes the information of whether $u_i$ is the left or right
        child of its parent.
\end{itemize}

Note that this is slightly more information than we previously mentioned: in our
description of the computation above we only cared about the state of the child
not on the $w$ to $v$ path. The information about the other child is redundant
when considering just one subtree with hole and can be ignored when doing
computation on that particular subtree with hole. But we will keep information
about both children so that we can handle questions about any subtree with hole.

Now consider the language $L_{q,q'} \subseteq \Sigma'^*$ of words $\alpha_0
\ldots \alpha_p$ such that the subtree with hole encoded by such a word, if the
hole's state were set to $q$, would lead to $A$ arriving at state $q'$ after
running up it.

\begin{lemma}
    $L_{q,q'}$ is a regular language.
\end{lemma}

Let's explicitly construct the automaton recognizing $L_{q,q'}$. Its state
space will be $\{ \mathsf{initial} \} \cup Q \times \{ \leftsymbol, \rightsymbol
\}$, states other than $\mathsf{initial}$ encoding, for the vertex represented
by the previously read letter, what state we arrived in at it and whether it was
the current vertex's left or right child. The automaton works as follows:

\begin{enumerate}
    \item In the initial state $\mathsf{initial}$, read $\alpha_0 = \langle p_0,
        a_0, q_0, d_0 \rangle$, set the state to $\langle q, d_0 \rangle$.
    \item Now when reading letter $\alpha_i = \langle p_i, a_i, q_i, d_i
        \rangle$ in state $\langle r, d \rangle$, if $d$ if $\leftsymbol$, use
        $A$'s transition function $\delta$ to compute the new tree state as
        $\delta(r, a_i, q_i)$ (the previously computed tree state is of our left
        child's, and our right child's comes from the current letter). If $d$ is
        $\rightsymbol$, the new tree state will instead be $\delta(p_i, a_i,
        r)$. Store this and $d_i$ as the new state.
    \item Accept if the final tree state is $q'$.
\end{enumerate}

This automaton recognizes exactly $L_{q,q'}$, by performing essentially the same
computation we described before. This completes the proof of the lemma.

Our algorithm for branch infix regular questions dealt with top-down questions
(i.e. the word we ask about runs from a higher vertex down to its descendant).
Regular languages are reversible, so we can preprocess $T$ for branch infix
regular questions for each language $L_{q, q'}^R$ (where $\cdot^R$ denotes the
reversed language). A question about whether the word on the path from $v$ down
to $w$ belongs to $L_{q, q'}^R$ is the same as asking if the word from $w$ up to
$v$ belongs to $L_{q,q'}$. Note that the number of languages we need to
preprocess for is constant in the size of the tree -- it only depends on the
size of the automaton.

Given the subtree of $v$ with hole $w$, once we've computed $w$'s new state $q$,
to compute $v$'s new state, take $v'$ -- $v$'s child in the direction of $w$,
find $q' \in Q$ such that the path from $v'$ to $w$ belongs to $L_{q, q'}^R$
(using branch infix regular questions; since $A$ is deterministic, there will be
exactly one such $q'$). Now from $q'$, the state of $v$'s other child, and $v$'s
new label, use $A$'s transition function to compute $v$'s new state.

Since $T$'s root is the root of one of the parts of our LCA partition, in the
end we will know whether or not $A$ accepts $T$ after relabeling. Handling each
part takes $O_A(1)$ time, and there are $O(m)$ parts to handle. We did require
$O(m \log m)$ time to sort the vertices mentioned in the question when computing
their LCA closure, thus question answering takes time $O_A(m \log m)$. This
concludes the proof of Theorem \ref{relabel-regular-queries-theorem}.

\section{The full algorithm}

As a recap we summarize all the steps of the full MSO query answering algorithm.

\textbf{preprocessing} Take an MSO formula $\varphi(X_1, \ldots, X_k)$ and a
$\Sigma$ labeled tree $T$.

\begin{enumerate}
    \item Transform $\varphi$ and $T$ into a deterministic bottom-up automaton
        $A$ and binary tree $T'$.
    \item Preprocess $T'$ for computing LCA closures:
        \begin{enumerate}
            \item Preprocess $T'$ for LCA questions;
            \item Store each vertex's in-order number.
        \end{enumerate}
    \item For each pair of $A$'s states, $q$ and $q'$, preprocess $T'$ for
        branch infix regular questions about the language $L_{q, q'}^R$.
\end{enumerate}

\textbf{answering questions} We receive the question ``for a tuple of sets of
$T$'s vertices, $\vec{W}$, does $T \models \varphi(\vec{W})$?''

\begin{enumerate}
    \item From the selection of $\vec{W}$, generate a relabeling of $T'$'s
        vertices that labels vertices mentioned in $\vec{W}$ as being assigned
        to the appropriate variables in $\vec{X}$. Let $W$ be this set of
        relabeled vertices.
    \item Compute $W'$, the smallest partition ready set containing $W$.
    \item Compute the LCA parition of $T'$ with respect to $W'$.
    \item In a bottom-up fashion, compute the state of $A$ running over the
        relabeled tree in the roots of each part of the LCA partition.
    \item Return YES if the state of $T$'s root is accepting, NO otherwise.
\end{enumerate}

\chapter{Conclusion}\label{conclusion}

\section{Complexities with respect to formulae and automata}

In our algorithms in Section \ref{wordinfix} and Chapters \ref{branchinfix} and
\ref{mso-query-answering} we treated the MSO formula or automaton (over words or
trees) that needed to be handled as a fixed parameter to our algorithms, and
didn't analyze how they impact time complexities. Let's quickly perform this
analysis now.

The MSO to automaton formula reduction implied by Theorem \ref{mso-to-automaton}
is unfortunately non-elementary, making MSO query answering non-elementary with
respect to $|\varphi|$. The situation improves if we start with a relabel
regular questions problem directly, rather than needing to reduce from an MSO
formula. The branch infix regular questions problem itself, which we use as a
subalgorithm when answering relabel questions, behaves very well when given a
deterministic automaton on input. To answer branch infix regular questions about
a tree $T$ and deterministic automaton $A$ we need to create $|Q|$ copies of
$T$ for highest marked descendant questions, one for each color (and there are
$|Q|$ colors). When answering a question, we will make at most $|Q|$ jumps, each
jump itself taking constant time. Thus, using our notation for algorithms with a
preprocessing phase, branch infix regular questions have an
\qptime{$O(|A||T|$)}{$O(|A|)$} algorithm. Relabel regular queries themselves will
unfortunately work in exponential time: recall that the languages we preprocess
for branch infix questions for are constructed by \textit{reversing} a
deterministic automaton's language. This is a procedure that requires
determinization of a nondeterministic automaton, and thus can explode
exponentially.

\section{Structures of bounded treewidth}

With our algorithm we get, ``for free'', an
\qptime{$O_{\varphi}(n)$}{$O_{\varphi}(m \log m)$} algorithm for MSO query
answering on structures of bounded treewidth. Indeed, bounded treewidth
structures are MSO interpretable in trees in linear time \cite{courcelle1992}.

\section{LCA closure question answering}

In Section \ref{computing-closure} we showed that the following question answering
problem
\queryproblem{LCA closure question answering}{%
    a tree $T$.
}{%
    given a set of tree vertices $S$, compute the LCA closure of $S$.
}
has an \qptime{$O(|T|)$}{$|S| \log |S|$} solution. We do not know whether this
is an optimal solution. If the question answering step's complexity could be
reduced to $O(|S|)$, MSO query answering on structures of bounded treewidth
would then be known to be solvable in
\qptime{$O_{\varphi}(n)$}{$O_{\varphi}(m)$}. Or is there a lower bound on
computing LCA closures in this setting, for example, would it be possible to
reduce comparison sorting to computing LCA closures?

\printbibliography

\end{document}