{\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
{\f3\froman\fcharset2\fprq2{\*\panose 05050102010706020507}Symbol;}{\f15\fswiss\fcharset0\fprq2{\*\panose 020b0604030504040204}Tahoma;}{\f45\froman\fcharset238\fprq2 Times New Roman CE;}{\f46\froman\fcharset204\fprq2 Times New Roman Cyr;}
{\f48\froman\fcharset161\fprq2 Times New Roman Greek;}{\f49\froman\fcharset162\fprq2 Times New Roman Tur;}{\f50\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f51\fswiss\fcharset238\fprq2 Arial CE;}{\f52\fswiss\fcharset204\fprq2 Arial Cyr;}
{\f54\fswiss\fcharset161\fprq2 Arial Greek;}{\f55\fswiss\fcharset162\fprq2 Arial Tur;}{\f56\fswiss\fcharset186\fprq2 Arial Baltic;}{\f135\fswiss\fcharset238\fprq2 Tahoma CE;}{\f136\fswiss\fcharset204\fprq2 Tahoma Cyr;}
{\f138\fswiss\fcharset161\fprq2 Tahoma Greek;}{\f139\fswiss\fcharset162\fprq2 Tahoma Tur;}{\f140\fswiss\fcharset186\fprq2 Tahoma Baltic;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;
\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;
\red192\green192\blue192;}{\stylesheet{\widctlpar\adjustright \fs22\lang2057\cgrid \snext0 Normal;}{\s1\sb240\sa60\keepn\widctlpar\adjustright \b\f1\fs28\lang2057\kerning28\cgrid \sbasedon0 \snext0 heading 1;}{\s2\sb240\sa60\keepn\widctlpar\adjustright 
\b\i\f1\lang2057\cgrid \sbasedon0 \snext0 heading 2;}{\s3\qc\sb100\sa100\keepn\nowidctlpar\adjustright \lang2057 \sbasedon0 \snext0 heading 3;}{\s4\qj\sb240\sa60\keepn\nowidctlpar\adjustright \b\fs22\lang2057 \sbasedon0 \snext0 heading 4;}{\*\cs10 
\additive Default Paragraph Font;}{\s15\qj\sb100\sa100\nowidctlpar\adjustright \lang2057 \sbasedon0 \snext15 Body Text;}{\s16\qc\nowidctlpar\outlinelevel0\adjustright \b\fs28\lang3081\kerning28 \sbasedon0 \snext16 Title;}{
\s17\qc\sa200\nowidctlpar\outlinelevel0\adjustright \b\kerning28 \sbasedon0 \snext18 LREC author name;}{\s18\qc\nowidctlpar\outlinelevel0\adjustright \fs20\kerning28 \sbasedon0 \snext19 LREC affiliation;}{\s19\qc\sb240\widctlpar\adjustright \b\fs20\cgrid 
\sbasedon0 \snext20 LREC heading Abstract;}{\s20\qj\sl-200\slmult0\widctlpar\adjustright \fs18\cgrid \sbasedon0 \snext20 LREC abstract text;}{\s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid \sbasedon0 \snext22 LREC Heading 1;}{
\s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid \sbasedon0 \snext22 LREC main body text;}{\s23\qj\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\fs22\cgrid \sbasedon0 \snext22 LREC Heading 2;}{\s24\qj\sb240\sl-220\slmult0
\widctlpar\adjustright \b\fs20\cgrid \sbasedon0 \snext22 LREC Heading 3;}{\*\cs25 \additive \i \sbasedon10 Emphasis;}{\*\cs26 \additive \ul\cf2 \sbasedon10 Hyperlink;}{\*\cs27 \additive \b \sbasedon10 Strong;}{\s28\qc\sb240\widctlpar\adjustright 
\fs20\lang2057\cgrid \sbasedon0 \snext28 LREC table caption;}{\s29\qj\fi-198\li198\sl-220\slmult0\widctlpar\adjustright \fs20\lang2057\cgrid \sbasedon0 \snext29 LREC biblio references;}{\s30\qc\sa240\nowidctlpar\outlinelevel0\adjustright 
\b\fs28\kerning28 \sbasedon0 \snext17 LREC title;}{\s31\qc\sb240\widctlpar\adjustright \fs20\lang2057\cgrid \sbasedon0 \snext31 LREC caption;}{\s32\qj\widctlpar\adjustright \fs18\lang2057\cgrid \sbasedon0 \snext32 LREC footnote;}{\*\cs33 \additive 
\ul\cf12 \sbasedon10 FollowedHyperlink;}{\s34\widctlpar\adjustright \cbpat9 \f15\fs22\lang2057\cgrid \sbasedon0 \snext34 Document Map;}{\s35\widctlpar\adjustright \fs20\lang2057\cgrid \sbasedon0 \snext35 footnote text;}{\*\cs36 \additive \super 
\sbasedon10 footnote reference;}}{\*\listtable{\list\listtemplateid67698703\listsimple{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'02\'00.;}{\levelnumbers\'01;}\fi-360\li360\jclisttab\tx360 }{\listname 
;}\listid9181033}{\list\listtemplateid68419599\listsimple{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'02\'00.;}{\levelnumbers\'01;}\fi-360\li360\jclisttab\tx360 }{\listname ;}\listid296231065}
{\list\listtemplateid68419585\listsimple{\listlevel\levelnfc23\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid296880076}
{\list\listtemplateid-1736287058\listsimple{\listlevel\levelnfc23\leveljc0\levelfollow0\levelstartat0\levelspace0\levelindent0{\leveltext\'01-;}{\levelnumbers;}\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid314451071}
{\list\listtemplateid68419599\listsimple{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'02\'00.;}{\levelnumbers\'01;}\fi-360\li360\jclisttab\tx360 }{\listname ;}\listid323749570}{\list\listtemplateid68419585
\listsimple{\listlevel\levelnfc23\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid626551441}{\list\listtemplateid67698689\listsimple
{\listlevel\levelnfc23\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid723992380}{\list\listtemplateid-321330758{\listlevel\levelnfc0
\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\'00;}{\levelnumbers\'01;}\fi-432\li432\jclisttab\tx432 }{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'03\'00.\'01;}{\levelnumbers
\'01\'03;}\fi-576\li576\jclisttab\tx576 }{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'05\'00.\'01.\'02;}{\levelnumbers\'01\'03\'05;}\fi-720\li720\jclisttab\tx720 }{\listlevel\levelnfc0\leveljc0
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'07\'00.\'01.\'02.\'03;}{\levelnumbers\'01\'03\'05\'07;}\fi-864\li864\jclisttab\tx864 }{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\'09\'00.\'01.\'02.\'03.\'04;}{\levelnumbers\'01\'03\'05\'07\'09;}\fi-1008\li1008\jclisttab\tx1008 }{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'0b\'00.\'01.\'02.\'03.\'04.\'05;}{\levelnumbers
\'01\'03\'05\'07\'09\'0b;}\fi-1152\li1152\jclisttab\tx1152 }{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'0d\'00.\'01.\'02.\'03.\'04.\'05.\'06;}{\levelnumbers\'01\'03\'05\'07\'09\'0b\'0d;}\fi-1296\li1296
\jclisttab\tx1296 }{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'0f\'00.\'01.\'02.\'03.\'04.\'05.\'06.\'07;}{\levelnumbers\'01\'03\'05\'07\'09\'0b\'0d\'0f;}\fi-1440\li1440\jclisttab\tx1440 }{\listlevel
\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'11\'00.\'01.\'02.\'03.\'04.\'05.\'06.\'07.\'08;}{\levelnumbers\'01\'03\'05\'07\'09\'0b\'0d\'0f\'11;}\fi-1584\li1584\jclisttab\tx1584 }{\listname ;}\listid796603225}
{\list\listtemplateid68419585\listsimple{\listlevel\levelnfc23\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid1175222695}
{\list\listtemplateid68419585\listsimple{\listlevel\levelnfc23\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid1252347702}
{\list\listtemplateid68419599\listsimple{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'02\'00.;}{\levelnumbers\'01;}\fi-360\li360\jclisttab\tx360 }{\listname ;}\listid1276137688}{\list\listtemplateid67698689
\listsimple{\listlevel\levelnfc23\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid1562330479}{\list\listtemplateid-1982584262\listsimple
{\listlevel\levelnfc0\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'02\'00.;}{\levelnumbers\'01;}\fi-360\li360\jclisttab\tx360 }{\listname ;}\listid1711806532}{\list\listtemplateid68419585\listsimple{\listlevel\levelnfc23
\leveljc0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid2092114841}{\list\listtemplateid68419585\listsimple{\listlevel\levelnfc23\leveljc0
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid2120760388}}{\*\listoverridetable{\listoverride\listid1711806532\listoverridecount0\ls1}
{\listoverride\listid796603225\listoverridecount0\ls2}{\listoverride\listid323749570\listoverridecount0\ls3}{\listoverride\listid2120760388\listoverridecount0\ls4}{\listoverride\listid1252347702\listoverridecount0\ls5}{\listoverride\listid296231065
\listoverridecount0\ls6}{\listoverride\listid723992380\listoverridecount0\ls7}{\listoverride\listid296880076\listoverridecount0\ls8}{\listoverride\listid1276137688\listoverridecount0\ls9}{\listoverride\listid1175222695\listoverridecount0\ls10}
{\listoverride\listid1562330479\listoverridecount0\ls11}{\listoverride\listid2092114841\listoverridecount0\ls12}{\listoverride\listid626551441\listoverridecount0\ls13}{\listoverride\listid314451071\listoverridecount0\ls14}{\listoverride\listid9181033
\listoverridecount0\ls15}}{\info{\title Second International Conference on Language Resources and Evaluation}{\author Maria Gavrilidou}{\operator SINTEF}{\creatim\yr2000\mo4\dy12\hr13\min9}{\revtim\yr2000\mo4\dy12\hr13\min9}
{\printim\yr2000\mo3\dy24\hr10\min21}{\version2}{\edmins0}{\nofpages1}{\nofwords4777}{\nofchars27232}{\*\company IEL-ILSP}{\nofcharsws33442}{\vern89}}\paperw11906\paperh16838\margl1077\margr1077\margt1701\margb1077 
\widowctrl\ftnbj\aenddoc\makebackup\formshade\viewkind4\viewscale100\pgbrdrhead\pgbrdrfoot \fet0\sectd \linex0\headery1440\footery1440\colsx709\sectdefaultcl {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2
\pnucltr\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6
\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang
{\pntxtb (}{\pntxta )}}\pard\plain \s30\qc\sa240\nowidctlpar\outlinelevel0\adjustright \b\fs28\kerning28 {\lang2070 Providing Internet Access to Portuguese Corpora: the AC/DC Project}{\lang2057 
\par }\pard\plain \s17\qc\sa200\nowidctlpar\outlinelevel0\adjustright \b\kerning28 {\lang2057 Diana Santos, Eckhard Bick
\par }\pard\plain \s18\qc\nowidctlpar\outlinelevel0\adjustright \fs20\kerning28 {\lang2057 SINTEF Telecom and Informatics
\par Postboks 1024 Blindern, N-0314 Oslo, Norway
\par }{\field{\*\fldinst { HYPERLINK mailto:}{\lang2057 Diana.Santos@informatics.sintef.no}{ }{{\*\datafield 
00d0c9ea79f9bace118c8200aa004ba90b0200000017000000230000004400690061006e0061002e00530061006e0074006f007300400069006e0066006f0072006d00610074006900630073002e00730069006e007400650066002e006e006f000000e0c9ea79f9bace118c8200aa004ba90b540000006d00610069006c00
74006f003a004400690061006e0061002e00530061006e0074006f007300400069006e0066006f0072006d00610074006900630073002e00730069006e007400650066002e006e006f000000}}}{\fldrslt {\cs26\ul\cf2 Diana.Santos@informatics.sintef.no}}}{\lang2057 , }
{\field\fldedit{\*\fldinst { HYPERLINK mailto:}{\lang2057 lineb@visl.hum.dk}{ }{{\*\datafield 
00d0c9ea79f9bace118c8200aa004ba90b0200000017000000120000006c0069006e006500620040007600690073006c002e00680075006d002e0064006b000000e0c9ea79f9bace118c8200aa004ba90b320000006d00610069006c0074006f003a006c0069006e006500620040007600690073006c002e00680075006d00
2e0064006b000000}}}{\fldrslt {\cs26\ul\cf2 lineb@hum.au.dk}}}{\lang2057  
\par }\pard\plain \s19\qc\sb240\widctlpar\adjustright \b\fs20\cgrid {\lang2057 Abstract
\par }\pard\plain \s20\qj\sl-200\slmult0\widctlpar\adjustright \fs18\cgrid {\lang2070 In this paper we report on the activity of the project Computational Processing of Portuguese (}{\i\lang2070 Processamento computacional do portugu\'eas}{\lang2070 
) in what concerns providing access to Portuguese corpora through the Internet. One of its activities, the AC/DC project (}{\i\lang2070 Acesso a corpora/Disponibiliza\'e7\'e3o de Corpora}{\lang2070 
, roughly "Access and Availability of Corpora") allows a user to query around 40 million words of Portuguese text. After describing the aim
s of the service, which is still subject to regular improvements, we focus on the process of tagging and parsing the underlying corpora, using a Constraint Grammar parser for Portuguese}{\lang2057 . 
\par 
\par \sect }\sectd \margtsxn1418\sbknone\linex0\headery1440\footery1440\cols2\colsx357\sectdefaultcl \pard\plain \s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid {\lang2057 General Background
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 The project Computational Processing of P
ortuguese (CPP) is financed by the Portuguese Ministry of Science and Technology in order to foster R&D on the computational processing of Portuguese and eventually lead to the availability of state-of-the-art products and services in Portuguese in the ne
w information age. The AC/DC project, a subactivity of CPP, fits in under the more general goal of tackling the problem of lack of available and usable resources for research and evaluation.}{\lang2057  
\par The main lines of activity are:
\par {\pntext\pard\plain\s22 \fs20\lang2057\cgrid \hich\af0\dbch\af0\loch\f0 1.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls3\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls3\adjustright {
\lang2057 Creation of publicly available resources
\par {\pntext\pard\plain\s22 \fs20\lang2057\cgrid \hich\af0\dbch\af0\loch\f0 2.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls3\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls3\adjustright {
\lang2057 Redistribution of already available resources
\par {\pntext\pard\plain\s22 \fs20\lang2057\cgrid \hich\af0\dbch\af0\loch\f0 3.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls3\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls3\adjustright {
\lang2057 Cataloguing the area 
\par {\pntext\pard\plain\s22 \fs20\lang2057\cgrid \hich\af0\dbch\af0\loch\f0 4.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls3\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls3\adjustright {
\lang2057 Evaluation of particular fields
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2057 The AC/DC project falls mainly under the second class of activities, but insofar as it adds value to existing corpora, it can be viewed as contributing equally to the first goal.
\par }\pard\plain \s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid {\lang2057 The AC/DC project
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 
The AC/DC project stemmed from the wish to have all so far publicly available corpora of the Portuguese language in a comparable form. After identification and clearance of the relevant copyright issue
s, the corpora were encoded in the IMS Corpus Workbench (Christ et al., 1999), to which a Web interface was added. Technical and scientific reasons for the choice of the underlying corpus system have already been described in (Santos, 1998); an additional
 political advantage was the fact that this system runs under Linux, a non-proprietary system. 
\par The AC/DC project has so far consisted of two phases. The first phase identified and brought to a common format the textual resources already available, providin
g a Web-based service of Portuguese corpora. The second phase improved the information associated with these corpora by parsing them with a broad-coverage parser for Portuguese, encoding the result in the same workbench, and serving them in the same overa
ll service.
\par }\pard\plain \s23\qj\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\fs22\cgrid {\lang2057 Overview of the first phase
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 
The first phase of the project resulted in five different corpora available for search on the Web since September 1999, on http://cgi.portugues.mct.pt/acesso/. This service has been regularly updated and improved sinc
e, and the corpora featured by the service \endash  reflecting its status on 23 March 2000 \endash  are summarized in Table 1. 
\par Paragraph and sentence separation were computed automatically for every corpus; for some corpora, other parts were also explicitly encoded by means of structural attributes, such as titles, footnotes and/or author identification.
\par 
\par }\trowd \trqc\trleft-129\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr
\brdrs\brdrw10 \cltxlrtb \cellx2030\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx3068\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr
\brdrs\brdrw10 \cltxlrtb \cellx4439\pard \s22\sl-220\slmult0\widctlpar\intbl\adjustright {\b\lang2070 Corpus
\par Identification\cell }\pard \s22\qc\sl-220\slmult0\widctlpar\intbl\adjustright {\b\lang2070 Size in
\par words (k)\cell Size in
\par sentences\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\trowd \trqc\trgaph56\trleft-185\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv
\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx1974\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb 
\cellx3012\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx4383\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Natura/P\'fablico\cell }\pard 
\s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 6,242\cell 225,088\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 
ENPCpub\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 72\cell 4,371\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright 
\fs20\cgrid {\lang2070 Natura/Di\'e1rio do Minho\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 2,110\cell 91,203\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain 
\s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 ECI-EBR\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 718\cell 43,866\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 
\row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 ECI-EE\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 26\cell 776\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {
\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 NILC/S\'e3o Carlos\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 33,618\cell 2,195,056\cell }\pard\plain 
\widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 FRASESPP\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 16\cell 594\cell 
}\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 FRASESPB\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 19
\cell 652\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\trowd \trqc\trgaph56\trleft-185\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv
\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx1974\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb 
\cellx3012\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx4383\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\b\lang2070 Total\cell }\pard 
\s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\b\lang2070 42,821\cell 2,561,606\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\b\lang2070 \row }\pard\plain \s22\qc\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2057 
Table 1: Portuguese corpora
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2070 
\par With the exception of the last two, which are sentence corpora instead of text corpora and whose original goal was parser p
erformance testing, none of these corpora have been compiled by the authors nor under the framework of the AC/DC project. Rather, our project started by amassing all resources already publicly available \endash 
 in quite distinct formats and revision status, incidentally. (We hope to report on the creation of a larger resource soon, but this lies outside the scope of the present paper.)
\par Let us outline here the process followed in the first phase of the AC/DC project (a more detailed description of the whole process can be found in (Santos, to appear)): 
\par After getting the corpora as text files in ISO-8859-1 format, Perl programs were written that 
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls4\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls4\adjustright {
\lang2070 Cleaned the input
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls4\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls4\adjustright {
\lang2070 Added structural tags (such as titles, parts, captions, signatures and the like)
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls4\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls4\adjustright {
\lang2070 (Assembled the whole corpus in case it happened to be distributed among several files)
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls4\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls4\adjustright {
\lang2070 Separated sentences and paragraphs
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls4\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls4\adjustright {
\lang2070 Properly tokenized the result (a quite complex process indeed)
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls4\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls4\adjustright {
\lang2070 Counted (and documented) the resulting object 
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2070 Then, the corpus objects were encoded in the 
IMS workbench, and installed in the Web server together with the corresponding HTML documentation, semi-automatically generated. For each corpus, several counts were done, and a quantitative overview of all corpora produced in table format. It would be ex
tremely tedious, and error-prone, to change the values by hand every time a change was introduced in the programs.
\par Comparing this service with the one reported in (Santos, 1998) for the Oslo Corpus of Bosnian Texts (OCBT), the main differences are:
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls5\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls5\adjustright {
\lang2070 no user identification is required, i.e., there's no additional layer of bureaucracy imposed on those who want to query the corpus;
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls5\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls5\adjustright {
\lang2070 no restrictions are made in terms of size of results or of query (since the corpora are freely available);}{\cs36\lang2070\super \chftn {\footnote \pard\plain \s32\qj\widctlpar\adjustright \fs18\lang2057\cgrid {\cs36\super \chftn }{
 Except for one corpus, which has the appropriate restrictions concealed in the Web interface.}}}{\lang2070 
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls5\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls5\adjustright {
\lang2070 there is no parsing of the user queries in addition to the one done by CQP (no attempt to correct the user, or to help him/her apart from very obvious cases).
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\cs27\lang2057 Even though the last feature may seem to be a step backwards, it is our belief that corpus users who require sophisticated q
ueries have to be able to pose them, so that, in the long run, they have to consult the CQP user's manual (Christ et al., 1999) and work their way through the possibilities offered. Adding a layer of "simplification" is just replacing one query language w
i
th another, which is not our goal. Although a new query language may be considered more user-friendly by some users, following such a method may lead to significantly compromising the system's expressive power, as is the case of the Norwegian tagged corpus,
 a project (see }{\lang2057 http://www.tekstlab.uio.no/norsk/bokmaal) }{\cs27\lang2057 which used}{\lang2057  }{\cs27\lang2057 
OCBT's underlying Web interface but provided a menu-based query language on top of it, which restricted, to a large extent, the system's original capabilities.
\par }\pard\plain \s23\qj\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\fs22\cgrid {Introduction to the second phase
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 The second phase of the project aims at enriching the aforementioned corpora with morphological, PoS and}{\i\lang2070  }{\lang2070 
syntactic annotation. To do this automatically, we use a robust Constraint Grammar based tagger-parser which - to our knowledge - is the most developed system for}{\cf6\lang2070  }{\lang2070 Portuguese to date.}{\i\lang2070  
\par }{\lang2070 The first two corpora were annotated and made available on the Web in February 2000. In Table 2, we provide a rough quantitative overview of their constitution.
\par 
\par }\trowd \trgaph108\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr
\brdrs\brdrw10 \cltxlrtb \cellx1529\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx3166\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr
\brdrs\brdrw10 \cltxlrtb \cellx4536\pard \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright {\b\lang2070 Parsed corpus\cell NATPANOT\cell EBRANOT\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\trowd \trgaph108
\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb 
\cellx1529\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx3166\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx4536
\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Sentences\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 224,500\cell 43,500\cell }\pard\plain \widctlpar\intbl\adjustright 
\fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Words\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 6,250,000\cell 709,000\cell }\pard\plain 
\widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Nouns\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 1,311,000\cell 141,500
\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Verbs\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 
770,000\cell 112,500\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Adjectives\cell }\pard \s22\qr\sl-220\slmult0
\widctlpar\intbl\adjustright {\lang2070 353,000\cell 40,000\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Adverbs\cell }\pard 
\s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 319,700\cell 48,000\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 
Proper nouns\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 541,000\cell 31,600\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\trowd \trgaph108\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 
\trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx1529\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl
\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx3166\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx4536\pard\plain \s22\qj\sl-220\slmult0
\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Contractions\cell }\pard \s22\qr\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 495,300\cell 43,100\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row 
}\pard\plain \s22\qc\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2057 Table 2: Annotated corpora
\par }\pard\plain \s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid {\lang2057 A CG parser for Portuguese 
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 The present sect
ion introduces the CG multi-level parsing system used in the annotation project, and presents some statistics of its performance applied to the Portuguese corpora in question.
\par The parser uses a lexicon base of about 50,000 lexemes for its morphological ana
lysis, and performs context governed rule based disambiguation at successive levels of analysis, including word class, inflexion, dependency syntax, valency instantiation and some experimental polysemy resolution.}{\i\lang2070  }{\lang2070 
Processing speed is ca. 500 words/sec on a Pentium II based Linux system, when all annotation levels are included.}{\lang2057 
\par }{\lang2070 Rules are expressed in the Constraint Grammar formalism (Karlsson, 1995), using the CG2 variant (Tapanainen, 1997). Following CG tradition, modular word based tags are used on all lev
els, and the system's grammar is implemented by adding or removing individual tags or sets of tags in a context dependent way. Usually, the whole sentence is used as a rule scope window, providing for a much richer context than is used in most probabilist
ic or automated learning systems. All in all, the grammar comprises about 8000 rules, of which 6000 are used in the present annotation task. The fact that the final parse is created in a reductionist way, and the }{\i\lang2070 last surviving }{
\lang2070 reading is regarded as correct, guarantees a high degree of robustness, especially when compared to PSG type systems based on rewriting rules.
\par In previous evaluations (Bick, 1996 and 2000)}{\i\lang2070 , }{\lang2070 at near 100% disambiguation, the system achieved correctness rates of over 99% for PoS and 
96-97% for syntax, when analysing free running text. So far, tests suggest that performance and robustness are fairly stable across a variety of written text types, for both Brazilian and modern European Portuguese. Pilot evaluations for the analysis of t
r
anscribed speech and historical texts indicate that the same system can handle even non-standard text types when allowing for a drop in syntactic performance of a few percentage points. Provided a fairly standard (or filtered) orthography, PoS tagging s
u
ffered no substantial decrease in performance. Not least in the present corpus annotation task, robustness has been a key factor, due to considerable text type differences between individual sub-corpora, and the incorporation of some speech and dialectal 
data (ECI-EBR). At the time of writing, only some pilot evaluation of parser performance variation had been done on the first couple of the AC/DC corpora. Correctness percentages relate to parser word token numbers (excluding punctuation).
\par }\pard \s22\qj\fi720\sl-220\slmult0\widctlpar\adjustright {\lang2070 
\par }\trowd \trgaph108\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr
\brdrs\brdrw10 \cltxlrtb \cellx1560\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx3166\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr
\brdrs\brdrw10 \cltxlrtb \cellx4536\pard \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright {\b\lang2070 Parsed corpus\cell NATPANOT\cell EBRANOT\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\trowd \trgaph108
\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb 
\cellx1560\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx3166\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx4536
\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 sample size\cell }\pard \s22\qc\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 4127 words\cell 2592 words\cell }\pard\plain \widctlpar\intbl\adjustright 
\fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 PoS correct\cell }\pard \s22\qc\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 99%\cell 99.3%\cell }\pard\plain 
\widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\trowd \trgaph108\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 
\clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx1560\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx3166\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl
\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx4536\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Syntax correct\cell }\pard \s22\qc\sl-220\slmult0\widctlpar\intbl\adjustright {
\lang2070 96.3%\cell 96.8%\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qc\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2057 Table 3: Parser performance
\par }\pard\plain \s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid {\lang2057 The syntactic annotation paradigm 
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 Though it can be used to generate syntactic tree structures  (as in the grammar teaching system at }{\field\flddirty{\*\fldinst {\lang2070  HYPERLINK http://visl.hum.sdu.dk) 
}{\lang2070 {\*\datafield 
00d0c9ea79f9bace118c8200aa004ba90b02000000170000001800000068007400740070003a002f002f007600690073006c002e00680075006d002e007300640075002e0064006b0029000000e0c9ea79f9bace118c8200aa004ba90b3200000068007400740070003a002f002f007600690073006c002e00680075006d00
2e007300640075002e0064006b0029002f000000}}}{\fldrslt {\cs26\ul http://visl.hum.sdu.dk)}}}{\lang2070 
, the parser internally handles syntax as (flat) dependency grammar, using syntactic function tags with directional dependency markers. On the clause level, @SUBJ> and @<SUBJ, for instance, mark subjects (or \endash  in the case of groups \endash  }{
\i\lang2070 heads }{\lang2070 of subjects), the arrows indicating the position of the governing verb (i.e. pre- and post-positioned subjects, respectively). At group level, dependency arrow heads are marked for head type: @>N is 
a prenominal modifier (to be combined with PoS information, like 'adjective', 'determiner' etc.), and @P< is the argument of a preposition. Clause function is marked by adding a second, "external", tag on clause header words (relatives, interrogatives and
 
subordinating conjunctions) or non-finite verb chain headers (in clauses without a header word). These tags indicate both syntactic form and function: @#FS-<ACC, for instance, means a finite subclause which functions as a direct (accusative) object. Toget
her, these syntactic dependency tags represent overall structure the same way a mobile is built \endash 
 every word knows its head, and derives its "outer function" from this head. In the example below, the prenominal (@>N) 'os' attaches to a subject head (@SUBJ>) 'problemas', which again "knows" its head, the finite main verb (@FMV) 's\'e3
o'. The whole subclause is "represented" by 'que' which carries two tags, the \endash  internal \endash  subordinator tag (@SUB), and the \endash  external \endash  object tag (@#FS-<ACC), which in turn attaches the clause to the top main verb (@FMV) 'sabe'.
\par }\pard\plain \sb120\widctlpar\tx851\tx2694\tx3828\tx6220\tx7938\adjustright \fs22\lang2057\cgrid {\fs18 Sabe\tab [saber]  V PR 3S IND\tab @FMV\tab 
\par }\pard \widctlpar\tx851\tx2694\tx3828\tx6220\tx7938\adjustright \shading1000 {\fs18 que\tab [que] KS}{\b\fs18 \tab @#FS-<ACC@SUB}{\fs18 
\par }\pard \widctlpar\tx851\tx1720\tx2694\tx3828\tx6220\tx7938\adjustright {\fs18 os\tab [o] <art> DET M P\tab \tab   @>N
\par problemas\tab [problema] N M P\tab \tab @SUBJ>
\par s\'e3o\tab [ser]  V PR 3P IND\tab \tab @FMV\tab 
\par }\pard \qj\widctlpar\tx851\tx2694\tx3828\tx6220\tx7938\adjustright {\fs18 graves\tab [grave] ADJ M/F P\tab \tab @<SC\tab 
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\tx851\tx2694\tx3828\adjustright \fs20\cgrid {\fs18\lang2057 
\par }{\lang2057 Since syntactic tags, in this scheme, are word based, they combine with other \endash  morphological \endash  tags in a natural way, and can easily be searched for with the same corpus searching tools.
\par }\pard\plain \s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid {\lang2057 The annotation process
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 Given the existence of the previously described parser, it was an obvious choice to use it in the AC/DC project. }{\cs27\lang2057 
The second phase of this project thus took place as a collaborative effort between the two sites, Oslo and }{\cs27\lang1044 \'c5rhus. It proceeded as follows}{\cs27\lang2057 : }{\lang2057  
\par From the corpus encoded in the first phase of the AC/DC project in Oslo, a pure text version was re-created and sent to }{\lang1044 \'c5rhus}{\lang2057 
 to be analysed by the parser, which was therefore free to do whatever tokenization it would find appropriate. The result, in the format returned by the parser \endash  after some filtering of parser-internal information \endash  was sent back to Oslo}{
\lang2070 , where it would then undergo considerable restructuring in order to:
\par {\pntext\pard\plain\s22 \fs20\lang2070\cgrid \hich\af0\dbch\af0\loch\f0 1.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls6\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls6\adjustright {
\lang2070 get back to the original tokenization.
\par {\pntext\pard\plain\s22 \fs20\lang2070\cgrid \hich\af0\dbch\af0\loch\f0 2.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls6\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls6\adjustright {
\lang2070 prepare for encoding in the IMS workbench.}{\lang2057 
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2057 The fact that this task is done in a collaborative way, and that ne
ither of the authors wanted to make fundamental changes to their respective systems for the purpose of this cooperation, resulted in a far more complicated process than is generally reported in the literature. Let us explain the reasons for doing it this 
way: 
\par First, Bick's parser is a general purpose system, not especially designed to annotate corpora in any specific}{\i\lang2057  }{\lang2057 
format. In fact, some of the many applications it has been used for (such as language teaching (Bick, 1997), machine translation (Bick, 2000)
 and lexicography) are generally considered more challenging than corpus annotation by itself. One would not, therefore, expect the parser to require considerable changes for the present application.
\par Second, the corpora and their prior processing were carefully considered, and there are several reasons why it would not be appropriate to change them:
\par {\pntext\pard\plain\s22 \f3\fs20\lang2057\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls7\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls7\adjustright {
\lang2057 We aimed at the most neutral tokenization, namely using only spaces and punctuation, as well as a list of abbreviations, as the sole source for the proce
ss. This makes tokenization repeatable, easy to document, and theory-neutral. That tokenization of real text is an error-prone complex process (as documented e.g. by Grefenstette and Tapanainen, 1994) can be easily seen in the fact that a percentage as hi
gh as 6% of the elements in some of the corpora (excluding punctuation proper) included punctuation marks (i.e., dots, hyphens, slashes, commas, etc.).
\par {\pntext\pard\plain\s22 \f3\fs20\lang2057\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls7\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls7\adjustright {
\lang2057 The corpora are intended to be employed, among other uses, as a way of comparing different systems as far
 as the annotation they provide is concerned. It would not do to bias any aspect of such comparison through the use of a particular parser's (in this case, Bick's) choices. 
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2057 On the other hand, it would have been a bad idea to force more than sentence separ
ation on a parser designed for handling running text, and this is why the corpora were transformed into "running text form" before submitting them to the parser. The presence or absence of spaces between punctuation and lexical material, for instance, con
t
ains a great deal of structural information which is exploited by the parser, but often lost in standard corpus mark-up where all punctuation is isolated and angle-bracketed. Also, the run-time splitting of contractions into individual "words" (like 'em+u
ma' for 'numa'), and the intermediate introduction of polylexical units (like 'em_vez_de' or 'do_que') considerably facilitate the recognition of rule context patterns, and thus, assignment of }{\i\lang2057 syntactic}{\lang2057  function tags.}{
\i\lang2057 
\par }{\lang2057 We}{\lang2070  show one example of the parser result, followed by the way it was re-encoded for the AC/DC project.
\par }{\fs18\lang2057 <p par=1>
\par <s>
\par H\'e1       \tab [haver]\tab  V PR 3S IND VFIN @FMV
\par casos    \tab [caso] \tab N M P @<ACC
\par jur\'eddicos \tab [jur\'eddico] ADJ M P @N<
\par que      \tab [que] <rel> SPEC M/F S/P }{\lang2057 @SUBJ> }{\fs18\lang2057 @#FS-N<
\par s\'e3o      \tab [ser] V PR 3P IND VFIN @FMV
\par como     \tab [como] <rel> <prp> ADV @COM @#AS-<SC
\par as      \tab [a] <artd> DET F P @>N
\par cerejas  \tab [cereja] N F P @AS<
\par $.
\par </s>
\par <s>
\par O        \tab [o] <dem> DET M S @APP
\par de      \tab [de] PRP @N<
\par Otelo    \tab [Otelo] PROP M/F S/P @P<
\par $,
\par por=exemplo      [por=exemplo] ADV @ADVL
\par $.
\par </s>
\par }{\lang2057 
\par }{\fs18\lang2057 <p par=1>
\par <s>
\par H\'e1      \tab haver  \tab V       \tab PR_3S_IND_VFIN  FMV
\par casos  \tab caso    \tab N       \tab M_P     <ACC
\par jur\'eddicos \tab jur\'eddico   \tab ADJ     \tab M_P     N<
\par que     \tab que     \tab SPEC_rel M/F_S/P SUBJ>_#FS-N<
\par s\'e3o     \tab ser     \tab V       \tab PR_3P_IND_VFIN  FMV
\par como    \tab como    \tab ADV_rel_prp     0       COM_#AS-<SC
\par as      \tab a       \tab DET_artd        F_P     >N
\par cerejas \tab cereja  \tab N       \tab F_P     AS<
\par .       \tab .       \tab PU      \tab 0       PONT
\par </s>
\par <s>
\par O      \tab o       \tab DET_dem M_S     APP
\par de      \tab de      \tab PRP     \tab 0       N<
\par Otelo   \tab Otelo   \tab PROP    \tab M/F_S/P P<
\par ,      \tab ,       \tab PU      \tab 0       PONT
\par por     \tab por=exemplo     \tab ADV     0       ADVL
\par exemplo \tab por=exemplo     \tab ADV     0       ADVL
\par .      \tab .       \tab PU      \tab 0       PONT
\par </s>
\par }{\lang2070 On the parser side of the proc
ess, two filter interfaces had to be crafted. An input filter, used to restore running text, removed and "stored" corpus meta tags (<par =357>, </s> etc.), and normalized punctuation to ordinary text standard (e.g. "-quotes). After analysis, a (more compl
ex) output filter}{\i\lang2070  }{\lang2070 
was used to remove all valency and semantic tags and, in the case of derived words, to create standard lexical base forms from the internally used root base forms and affix information. Also, possible orthographical changes introduced by t
he parser in its search for lexicon matches (Luso-Brazilian variation, spelling and accentuation irregularities) were reversed in order to maintain maximal corpus fidelity.
\par What was }{\i\lang2070 not }{\lang2070 filtered in the present project, were the actual tags, making the anno
tated corpora compatible with live CG style analyses (as, for instance, at http://visl.hum.sdu.dk). Given the size, modularity and granularity of the combined tag sets from }{\i\lang2070 all}{\lang2070  parsing}{\i\lang2070  }{\lang2070 
levels, it would, however, be feasible to create filters for a wide variety of different (less detailed) tag sets at a later stage, as has been repeatedly shown when co-operating with other teaching or tagging projects.
\par }\pard\plain \s23\qj\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\fs22\cgrid {\lang2070 The annotation result
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 Let us explain in more detail the conversions done in some cases, and their motivation.
\par }\pard\plain \s24\qj\sb240\sl-220\slmult0\widctlpar\adjustright \b\fs20\cgrid {\lang2070 Clitic processing
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 Clitic processing is always a vexing problem in the automatic analysis of Portuguese, due to the mesoclitics (}{\i\lang2070 afirm\'e1-lo-ei, }{\lang2070 
'state-it-I will') and the phono-graphical changes required by the clitics (}{\i\lang2070 pu-la}{\lang2070  = }{\i\lang2070 pus}{\lang2070 +}{\i\lang2070 a, }{\lang2070 'I put her'). 
\par In a 
corpus context, the problem is to keep both the form actually used and the information about which "canonical", non-cliticized form it corresponds to, in a way that allows easy search for both. There is no simple solution to this problem, as the discussion of several al
ternatives will hopefully convince the reader:
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls8\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls8\adjustright {
\lang2070 "Restoration" of the underlying forms (such as }{\i\lang2070 afirmarei-o}{\lang2070 ) would destroy the actual text, in addition to creating non-Portuguese (i.e., ungrammatical) sequences. This would undermine the most important motivat
ion for consulting corpora in the first place: the need for authentic text material.
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls8\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls8\adjustright {
\lang2070 Separation by the hyphen would create morphemes that cannot occur in isolation (such as }{\i\lang2070 afirm\'e1}{\lang2070  or }{\i\lang2070 ei}{\lang2070 ); and would increase ambiguity of the participant forms where there is
 none (e.g., in the case of }{\i\lang2070 como-a}{\lang2070  ('I eat it'), a verb form followed by an accusative personal pronoun would be transformed into }{\i\lang2070 como}{\lang2070  and }{\i\lang2070 a}{\lang2070 
, both highly ambiguous word forms in Portuguese).
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls8\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls8\adjustright {
\lang2070 Marking that the forms did not occur in isolation by leaving hyphens on both sides, finally, would have the disadvantage both of not preserving the text and of not letting one look for similar words (e.g., in }{\i\lang2070 Vi o rapaz}{\lang2070 
 and }{\i\lang2070 Vi-o}{\lang2070 , one would have }{\i\lang2070 Vi}{\lang2070  and }{\i\lang2070 Vi-}{\lang2070  for the first person of the verb }{\i\lang2070 ver}{\lang2070 ).
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls8\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls8\adjustright {
\lang2070 Finally, leaving the verb 
and the clitic as a single token, in addition to not letting forms decide on similar words, as in the previous case, makes the information rather compact, in that the classification of one token has to carry both the features pertaining to the verb and th
e ones pertaining to the clitic(s).  
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2070 This last choice is, however, the easiest to accomplish and the one chosen in the }{\i\lang2070 first}{\lang2070 
 phase of the AC/DC project. Its advantages are that the form is preserved, and that the information on the smaller constituents is then provided by the parser in a second phase}{\cs36\lang2070\super \chftn {\footnote \pard\plain 
\s32\qj\widctlpar\adjustright \fs18\lang2057\cgrid {\cs36\super \chftn }{ Syntactically, in any case, regarding a verb+clitic construction or a preposition+determiner contraction as one (functional) unit is awkward, since the very
 notion of syntactic constituents contradicts graphical word boundaries in these cases.}}}{\lang2070 . It is also the option that makes counting easiest: a word is determined solely by graphical means.}{\cs36\lang2070\super \chftn {\footnote \pard\plain 
\s32\qj\widctlpar\adjustright \fs18\lang2057\cgrid {\cs36\super \chftn }{ One can, of course, also count which words have hyphens and, of these, which are most probably verbs with clitics. A full-fledged parser may still be required to deci
de in a few cases, though, especially when there are typos in the material.}}}{\lang2070 
\par }\pard\plain \s24\qj\sb240\sl-220\slmult0\widctlpar\adjustright \b\fs20\cgrid {\lang2070 Compounds/MWE treatment
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 The same rationale applies to the treatment of multiword expressions. While wishing
 to maintain the information provided by the parser on compounds, we do not want to lose a parser independent tokenization strategy. 
\par One can distinguish three kinds of cases considered as one token by the parser:
\par {\pntext\pard\plain\s22 \fs20\lang2070\cgrid \hich\af0\dbch\af0\loch\f0 1.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls9\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls9\adjustright {
\lang2070 What is called in traditional Portuguese grammar "locu\'e7\'f5es", i.e., several words working as a grammatical unit, such as }{\i\lang2070 a partir de}{\lang2070 , }{\i\lang2070 no entanto}{\lang2070 , }{\i\lang2070 por tr\'e1s de}{\lang2070 
, etc.
\par {\pntext\pard\plain\s22 \fs20\lang2070\cgrid \hich\af0\dbch\af0\loch\f0 2.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls9\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls9\adjustright {
\lang2070 Idioms and fixed phrases with no morphological variation, such as }{\i\lang2070 por exemplo}{\lang2070 , }{\i\lang2070 um pouco}{\lang2070 , }{\i\lang2070 de p\'e9}{\lang2070 , }{\i\lang2070 castelo de cartas}{\lang2070 , }{\i\lang2070 
seja como for}{\lang2070 , }{\i\lang2070 fora de si}{\lang2070 , }{\i\lang2070 por minha causa}{\lang2070 , }{\i\lang2070 n\'e3o h\'e1 como}{\lang2070 , }{\i\lang2070 horas seguidas}{\lang2070 , etc.
\par {\pntext\pard\plain\s22 \fs20\lang2070\cgrid \hich\af0\dbch\af0\loch\f0 3.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls9\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls9\adjustright {
\lang2070 And all sorts of compound proper nouns, e.g. }{\i\lang2070 Rua S\'e3o Justino}{\lang2070 , }{\i\lang2070 Seu Carlos}{\lang2070 , or }{\i\lang2070 Auto da Compadecida}{\lang2070 .
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2070 Even though the difficulty of making this sort of decision varies according to the kind, in no 
case is this identification an error-free process. One cannot, therefore, rely blindly on the parser output. Nor can one expect to be able to decide without help of any parser. It is also obvious that different parsers, grammar theories and lexicons may d
rastically differ in such classification decisions. In (Santos, 1990), it was even suggested that the definition of MWEs is actually application dependent.
\par We have thus decided to provide each form as a distinct token, while keeping available the result of the parser's processing by encoding the whole compound as the lemma for each form, as in the case of }{\i\lang2070 por exemplo}{\lang2070 
 in the previous example, or in the following MWEs featuring the word }{\i\lang2070 horas }{\lang2070 ('hours').}{\cf6\lang2070 
\par }{\fs18\lang2070 
\par horas   \tab horas=de=ponta  N       F_P     P<
\par de      \tab horas=de=ponta  N       F_P     P<
\par ponta   \tab horas=de=ponta  N       F_P     P<
\par 
\par horas   \tab horas=seguidas  N       F_P     <ADV
\par seguidas  horas=seguidas  N       F_P     <ADV
\par 
\par Horas   \tab horas=a=fio     ADV     0       ADVL>
\par a      \tab horas=a=fio     ADV     0       ADVL>
\par fio     \tab horas=a=fio     ADV     0       ADVL>
\par 
\par }{\lang2070 The encoded result shows that not all tokens in the corpus are individually classified. While this is possibly irrelevant for "members" of a proper noun, it may be disturbing for other uses of the cor
pus. However, since these cases form a closed list, automatic addition of subanalyses for MWEs is a feasible solution:
\par }{\fs18\cf6\lang2070 
\par }{\fs18\lang2070 Horas\tab hora\tab N\tab F_P\tab ADVL>
\par a\tab a\tab PRP\tab 0\tab N<
\par fio\tab fio\tab N\tab M_S\tab P<
\par }\pard\plain \s24\qj\sb240\sl-220\slmult0\widctlpar\adjustright \b\fs20\cgrid {\lang2070 Contractions
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 Contractions in Portuguese are cases where a preposition and a determi
ner (article or pronoun) are contracted into a single word form, with no orthographical marking (examples are }{\i\lang2070 dela}{\lang2070 , }{\i\lang2070 comigo}{\lang2070 , }{\i\lang2070 pro}{\lang2070 , }{\i\lang2070 pelas}{\lang2070 , }{\i\lang2070 do
}{\lang2070 , respectively de + ela, com + eu, para + o, por + as, de + o). The parser transforms these items into their constituents, and sets a morphological flag.
\par Consistent with our approach in the two previous cases, we restore the contractions and add the corresponding attributes. 
\par 
\par Summing up, in addition to our wish not to modify the original text, one important reason why we go to all this trouble is that alternative analyses require different tokenization, in each of the three cases discussed: 
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls11\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls11\adjustright {
\lang2070 Contractions: }{\i\lang2070 deste}{\lang2070  (a verb form or the contraction }{\i\lang2070 de}{\lang2070  + }{\i\lang2070 este}{\lang2070 ); }{\i\lang2070 pelo}{\lang2070  (a singular noun or the contraction }{\i\lang2070 por}{\lang2070  + }{
\i\lang2070 o}{\lang2070 ); }{\i\lang2070 consigo}{\lang2070  (a verb form or the contraction }{\i\lang2070 com}{\lang2070  + }{\i\lang2070 si}{\lang2070 )
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls11\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls11\adjustright {
\lang2070 MWEs: }{\i\lang2070 mais valia}{\lang2070  (adverb plus verb or complex noun); }{\i\lang2070 a favor de}{\lang2070  (preposition noun preposition or complex expression).
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls11\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls11\adjustright {
\lang2070 Clitics: }{\i\lang2070 tem-nos}{\lang2070  (the clitic is }{\i\lang2070 nos}{\lang2070  (first person plural) or underlying }{\i\lang2070 os}{\lang2070  (third person object masculine pronoun)).
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2070 The extent to which tokenization is different in the two systems is surprisingly large, as Table 4 proves beyond doubt. 
\par 
\par }\trowd \trkeep\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 
\cltxlrtb \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx2977\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb 
\cellx4678\pard \s22\qc\sl-220\slmult0\widctlpar\intbl\adjustright {\b\lang2070 Processing
\par stage\cell EBRANOT
\par with and without
\par punctuation\cell NATPANOT
\par with and without
\par punctuation\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\trowd \trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \clvertalt
\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx2268\clvertalt\clbrdrt
\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx2977\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx3828\clvertalt\clbrdrt
\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx4678\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Original version\cell }\pard \s22\qc\sl-220\slmult0
\widctlpar\intbl\adjustright {\lang2070 884,729\cell 661,395\cell 857,742\cell 696,918\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {
\lang2070 Parser's output\cell }\pard \s22\qc\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 889,580\cell 669,162\cell 878,351\cell 719,700\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain 
\s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Contr. merging\cell }\pard \s22\qc\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 846,723\cell 626,305\cell 816,170\cell 657,520\cell }\pard\plain 
\widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 MWE expansion\cell }\pard \s22\qc\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 878,586\cell 
658,168\cell 845,113\cell 686,463\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\trowd \trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv
\brdrs\brdrw10 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb 
\cellx2268\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx2977\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx3828
\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb \cellx4678\pard\plain \s22\qj\sl-220\slmult0\widctlpar\intbl\adjustright \fs20\cgrid {\lang2070 Clitics merge\cell }\pard 
\s22\qc\sl-220\slmult0\widctlpar\intbl\adjustright {\lang2070 872,563\cell 652,146\cell 843,372\cell 684,723\cell }\pard\plain \widctlpar\intbl\adjustright \fs22\lang2057\cgrid {\lang2070 \row }\pard\plain \s22\qc\sl-220\slmult0\widctlpar\adjustright 
\fs20\cgrid {\lang2057 Table 4: Tokenization size
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2070 
\par Putting together 
verbs with enclitics accounts for a reduction of 0.9% in EBRANOT and 0.25% in the first part of NATPANOT. The corresponding shrinking for contractions is 6.4% and 8.6% respectively. On the other hand, the expansion of MWE into several tokens contributes t
o increases in the number of tokens of 5.1% and 4.4%. Even though the aim of all this processing is the restoration of the original tokenization, we still have differences in the number of tokens, the reasons for which are currently being investigated. 

\par All 
in all, compared to the tokens returned by the parser, 12% to 14% of the final tokens, excluding punctuation marks, are new. This is an interesting measure, in our opinion, since it shows how unreliable measures of performance (e.g. errors per "words")}{
\cf6\lang2070  }{\lang2070 can be when they are compared numerically without taking into account the tokenization}{\cf6\lang2070  }{\lang2070 assumptions involved.
\par }\pard\plain \s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid {\lang2057 Encoding in IMS-CWB
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 We do not intend to provide here more than some general clues as to the use of the IMS-CWB. Readers of this section are encouraged to read elsewhere (Christ et al., 1999; Christ, 1998) on the capabilities and internals of this corpus system. But for those who already use it, it might be relevant to motivate some of our choices.
\par }\pard\plain \s23\qj\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\fs22\cgrid {\lang2070 Positional attributes
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 Different annotation levels were e
ncoded as four different positional attributes: lemma; part of speech (N, V, ADJ, DET, etc. plus a combination thereof in the case of multiword expressions; together with some subcategories returned by the parser, as in DET_poss or ADV_dem_quant_komp); mo
r
phological information (like gender, tense, etc.); and functional information. Neither morphological nor functional information is necessarily unique, so the several pieces are concatenated, separated by underscores, in order to provide one value for the 
corresponding attribute. Thus, "M_P" stands for }{\ul\lang2070 M}{\lang2070 asculine }{\ul\lang2070 P}{\lang2070 
lural, and "PRD_#AS-<ADVL" means that the word functions as a role predicator and indicates an absolute clause which has the function ADVL in the main clause. 
\par In some undecidable cases the parser uses morphological portmanteau-tags. Here, alternative or undefined options are marked with a slash (e.g. "S/P" means singular }{\b\lang2070 or}{\lang2070 
 plural). For its syntactical annotation, the parser resolves remaining ambiguity by progressively more heuristic rule levels.}{\cs36\lang2070\super \chftn {\footnote \pard\plain \s32\qj\widctlpar\adjustright \fs18\lang2057\cgrid {\cs36\super \chftn }{
 In fact, some syntactic ambiguity can - in flat dependency grammar - be expressed by using only }{\i one}{ tag: @N<, for instance, means a postnominal constituent, but underspecifies ju
st how many nouns to the left the attachment head is to be found (e.g. 'o @>N }{\ul homem}{ @? com @N< a @>N bicicleta @P< da @N< China @P<').}}}{\lang2070  
Provisionally, for the task at hand (a corpus search interface), tag ambiguity was set to be zero. 
\par One of the obvious advantages of the physical separation between corpus and annotation provided by the IMS-CWB is that the very same corpus can have POS1, L
EMA1, etc. for this parser's output, and POS2, LEMA2, etc. for another. As mentioned above, we intend to annotate the same corpora with different parsers and taggers, which will allow a user to look for systematic differences between systems or problemati
c areas in general for the parsing of Portuguese. 
\par }\pard\plain \s23\qj\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\fs22\cgrid {\lang2070 The Web interface
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\cs27\lang2057 For the moment, the Web interface is simply a window into CQP \endash  the corpus query processor \endash  with some trivial possibilities added: 
\par {\pntext\pard\plain\s22 \cs27\f3\fs20\lang2057\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls12\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}
\ls12\adjustright {\cs27\lang2057 the semicolon is not required when only one command is involved 
\par {\pntext\pard\plain\s22 \cs27\f3\fs20\lang2057\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls12\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}
\ls12\adjustright {\cs27\lang2057 quotes are not obligatory when only one token is involved. 
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\cs27\lang2057 The only substantial addition is the possibility of asking for the distribution of a regular expression as a simple query. 
\par As far as restrictions are concerned, the user cannot change the corpus s/he is querying. Likewise, s/he cannot, for obvious reasons, rely on the use of local corpora.
\par Given that the parsing of the corpora is still work in progress, we keep the non-annotated and the annotated versions of each corpus distinct. We expect to merge them when 100% tokenization agreement is achieved.}{\lang2070 
\par We present here some examples of the query power allowed by our service, due to the combined advantages of using this parser, the IMS corpus workbench and the particular Web interface. One can l
ook for (see documentation on our Website for the actual syntax):
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls13\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls13\adjustright {
\lang2070 Objects of the verb X
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls13\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls13\adjustright {
\lang2070 Verbs which have as object Y
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls13\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls13\adjustright {
\lang2070 Preposition X occurring }{\i\lang2070 not}{\lang2070  within}{\cf6\lang2070  }{\lang2070 a proper noun 
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls13\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls13\adjustright {
\lang2070 Nouns having pre-modifying }{\i\lang2070 and/or}{\lang2070  post-modifying adjectives
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls13\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls13\adjustright {
\lang2070 Verbs in the conditional
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls13\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls13\adjustright {
\lang2070 Words not in the lexicon
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls13\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls13\adjustright {
\lang2070 Adjectives forming part of (complex) proper nouns
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls13\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls13\adjustright {
\lang2070 Forms being used both as verbs and as nouns in the corpus, but more frequently as verbs
\par }\pard\plain \s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid {\lang2057 Evaluation
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 Evaluation of the usefulness of the service \endash  or more especially of the second phase \endash  can be done according to different axes:
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls10\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls10\adjustright {
\lang2070 number of visits and successful queries
\par {\pntext\pard\plain\s22 \f3\fs20\lang2070\cgrid \loch\af3\dbch\af0\hich\f3 \'b7\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlblt\ilvl0\ls10\pnrnot0\pnf3\pnstart1\pnindent360\pnhang{\pntxtb \'b7}}\ls10\adjustright {
\lang2070 accuracy/recall of the queries (which is obviously also dependent on parser performance)
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2070 Due to lack of sufficient information \endash  the documentation is being written in parallel
 with writing the present paper, and no general announcement has yet been made regarding the second phase of the AC/DC project \endash 
 it is too early to study access patterns. We intend to measure the usefulness of the query result for particular queries by making three different counts:
\par {\pntext\pard\plain\s22 \fs20\lang2070\cgrid \hich\af0\dbch\af0\loch\f0 1.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls15\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls15\adjustright {
\lang2070 how many examples one would have to look at if the corpus were not tagged, compared to the ones found
\par {\pntext\pard\plain\s22 \fs20\lang2070\cgrid \hich\af0\dbch\af0\loch\f0 2.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls15\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls15\adjustright {
\lang2070 how many cases found were actually right
\par {\pntext\pard\plain\s22 \fs20\lang2070\cgrid \hich\af0\dbch\af0\loch\f0 3.\tab}}\pard \s22\qj\fi-360\li360\sl-220\slmult0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls15\pnrnot0\pndec\pnstart1\pnindent360\pnhang{\pntxta .}}\ls15\adjustright {
\lang2070 how many cases were missing
\par }\pard \s22\qj\sl-220\slmult0\widctlpar\adjustright {\lang2070 The first measure can actually be the most important in an interactive service where one can refine one's queries and try alternative query options at once.
\par }\pard\plain \s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid {\lang2057 Future work: a third phase?
\par }\pard\plain \s22\qj\sl-220\slmult0\widctlpar\adjustright \fs20\cgrid {\lang2070 The most obvious need after creating this service, and the resources it serves, is to supply enough secondary documentation and t
eaching material so that would-be corpus users can exploit to their satisfaction a reasonable part of the tools provided. Such secondary material could include guided tours, a discussion of alternative grammatical or tagging approaches, FAQ-lists, or a re
gular teaching interface for university students.
\par In terms of content, the service could be improved by proof-reading some of the automatically annotated corpora, tagging other text kinds (speech data, historical data) or using the parser to provide graphic
al syntactic tree structure annotation (cf. http://visl.hum.sdu.dk). 
\par Although manual annotation of the corpora is a possibility, we believe it is better first to engage in a detailed analysis of the parser's performance, with subsequent documentation of i
ts strengths and weaknesses. In the long run, an improved parser would allow faster annotation proofing, so }{\i\lang2070 manual}{\lang2070  corpus annotation should not be seen as an end in itself.
\par Finally, other work falling under the scope of the AC/DC project, no longer necessa
rily connected with the second author, is to engage in the same kind of collaborative process with other parsers, resources and systems available for Portuguese, and compare the results.
\par }\pard\plain \s21\qc\sb240\sa60\sl-220\slmult0\widctlpar\adjustright \b\cgrid {\lang2057 References 
\par }\pard\plain \s29\qj\fi-198\li198\sl-220\slmult0\widctlpar\adjustright \fs20\lang2057\cgrid {\lang2070 Bick, Eckhard. (1996). Automatic parsing of Portuguese. I
n Proc. Second Workshop on Computational Processing of Written Portuguese (Curitiba, 23-25 October 1996) (pp. 91--100).
\par Bick, Eckhard. (1997). Internet Based Grammar Teaching. In E. Christoffersen & B. Music (Eds.),}{\i\lang2070  }{\lang2070 Datalingvistisk Forenings \'e5rsm\'f8de 1997 - DALF '97 (pp. 86--106). Kolding.
\par Bick, Eckhard. (1998). Structural Lexical Heuristics in the Automatic Analysis of Portuguese. In B. Maegaard (Ed.), Proc. 11th Nordic Conference on Computational Linguistics, Nodalida '98 (pp. 44--56). Copenhagen.
\par Bick, Eckhard. (2000). The Parsing System "Palavras" \endash  Automatic Grammatical Analysis of Portuguese in a Constraint Grammar Framework. \'c5rhus.
\par }{Christ, Oliver. (1998). Linking WordNet to a Corpus Query System. In J. Nerbonne (Ed.), Linguistic Databases}{\i  }{(pp. 189--202). Stanford: CSLI Publications.
\par Christ, O., Schulze, B. M., Hofmann, A., & Koenig, E. (1999). The IMS Corpus Workbench: Corpus Query Processor (CQP): User's Manual. University of Stuttgart, March 8, 1999 (CQP V2.2).
\par Grefenstette, G. & Tapanainen, P. (1994). What is a word, What is a sentence? Problems of Tokenization. In Proc. 3rd International Conference on Computational Lexicography, COMPLEX'94 (pp. 79--87).
\par }\pard\plain \qj\fi-284\li284\widctlpar\tx851\tx1701\tx2552\tx3402\tx4253\tx5103\tx5954\tx6804\tx7655\adjustright \fs22\lang2057\cgrid {\fs20 Karlsson, Fred, et al. (1995). Constraint Grammar, A Language-Independent System for Parsing Unrestricted Text}{
\i\fs20 . }{\fs20 Berlin: Mouton de Gruyter.
\par }\pard\plain \s29\qj\fi-198\li198\sl-220\slmult0\widctlpar\adjustright \fs20\lang2057\cgrid {Santos, Diana. (1990). Lexical gaps and idioms in Machine Translation. In H. Karlgren (Ed.),}{\i  }{Proceedings of COLING'90,}{\i  }{
Vol. 2 (pp. 330--335). Helsinki.}{\lang2070 
\par Santos, Diana. (1998). }{Providing access to language resources th
rough the World Wide Web: the Oslo Corpus of Bosnian Texts. In Rubio et al. (Eds.), Proceedings of The First International Conference on Language Resources and Evaluation, Vol. 1 (pp. 475--481). Granada.
\par Santos, Diana. (to appear). Compara\'e7\'e3o de corpora em portugu\'eas: algumas experi\'eancias. In T. Berber Sardinha (Ed.), L\'edngua Portuguesa no Computador, S\'e3o Paulo.
\par }\pard\plain \qj\fi-284\li284\widctlpar\tx851\tx1701\tx2552\tx3402\tx4253\tx5103\tx5954\tx6804\tx7655\adjustright \fs22\lang2057\cgrid {\fs20 
Tapanainen, Pasi. (1996). The Constraint Grammar Parser CG-2. Publication No. 27. Helsinki: Department of General Linguistics, University of Helsinki.}{\i\fs20  }{\fs20 
\par }}