001    package org.rakeshv.tex;
002    
003    import com.stevesoft.pat.*;
004    
005    /**
006            *       <p>This class provides <code>static methods</code> that do very 
007            *       common transformations to <code>ASCII text</code> files that
008            *       are converted to <code>L<sup><small>A</small></sup>T<sub><small>E</small></sub>X</code>.</p>
009            */
010    public final class TransformTextFile
011    {
012            /**
013                    *       Regular expressions that are used to transform unformatted
014                    *       dates to TeX and HTML.
015                    */
016            private static Regex dateRegex[] = {
017                    new Regex( "(?i) ([0-9]{1,2})st", "\\\\opt{dvi,pdf,ps}{${1}\\\\textsuperscript{st}}\\\\opt{html}{\\\\HCode{${1}<sup><small>st</small></sup>}}" ),
018                    new Regex( "(?i)([0-9]{1,2})nd", "\\\\opt{dvi,pdf,ps}{${1}\\\\textsuperscript{nd}}\\\\opt{html}{\\\\HCode{${1}<sup><small>nd</small></sup>}}" ),
019                    new Regex( "(?i)([0-9]{1,2})rd", "\\\\opt{dvi,pdf,ps}{${1}\\\\textsuperscript{rd}}\\\\opt{html}{\\\\HCode{${1}<sup><small>rd</small></sup>}}" ),
020                    new Regex( "(?i)([0-9]{1,2})th", "\\\\opt{dvi,pdf,ps}{${1}\\\\textsuperscript{th}}\\\\opt{html}{\\\\HCode{${1}<sup><small>th</small></sup>}}" ),
021            };
022    
023            /**
024                    *       Regular expressions that change American spelling to English
025                    */
026            private static Regex spellingRegex[] = {
027                    new Regex( "([a|A])rdor", "${1}rdour" ),
028                    new Regex( "([a|A])rmor", "${1}rmour" ),
029                    new Regex( "([a|A])rmourial", "${1}rmorial" ),
030                    new Regex( "([b|B])ehavior", "${1}ehaviour" ),
031                    new Regex( "([c|C])andor", "${1}andour" ),
032                    new Regex( "([c|C])olor", "${1}olour" ),
033                    new Regex( "([c|C])lamor", "${1}lamour" ),
034                    new Regex( "([d|D])emeanor", "${1}emeanour" ),
035                    new Regex( "([d|D])olor", "${1}olour" ),
036                    new Regex( "([e|E])ndeavor", "${1}ndeavour" ),
037                    new Regex( "([f|F])avor", "${1}avour" ),
038                    new Regex( "([f|F])lavor", "${1}lavour" ),
039                    new Regex( "([f|F])ervor", "${1}ervour" ),
040                    new Regex( "([h|H])arbor", "${1}arbour" ),
041                    new Regex( "([h|H])onor", "${1}onour" ),
042                    new Regex( "([h|H])umor([^a-z]{1,})", "${1}umour${2}" ),
043                    new Regex( "([h|H])umored", "${1}umoured" ),
044                    new Regex( "([l|L])abor([^a-z]{1,})", "${1}abour${2}" ),
045                    new Regex( "([l|L])aborer", "${1}abourer" ),
046                    new Regex( "([n|N])eighbor", "${1}eighbour" ),
047                    new Regex( "([o|O])dor", "${1}dour" ),
048                    new Regex( "([p|P])arlor", "${1}arlour" ),
049                    new Regex( "([r|R])igor", "${1}igour" ),
050                    new Regex( "([r|R])igourous", "${1}igorous" ),
051                    new Regex( "([r|R])umor", "${1}umour" ),
052                    new Regex( "([s|S])avior", "${1}aviour" ),
053                    new Regex( "([s|S])plendor", "${1}plendour" ),
054                    new Regex( "([s|S])uccor", "${1}uccour" ),
055                    new Regex( "([v|V])alor", "${1}alour" ),
056                    new Regex( "([v|V])apor", "${1}apour" ),
057                    new Regex( "([v|V])apourous", "${1}aporous" ),
058                    new Regex( "([v|V])igor([^a-z])", "${1}igour${2}" ),
059                    new Regex( "([e|E])labourate", "${1}laborate" ),
060                    new Regex( "([h|H])onourary", "${1}onorary" ),
061                    new Regex( "([l|L])abourious", "${1}aborious" ),
062                    new Regex( "([a|A])goniz", "${1}gonis" ),
063                    new Regex( "([a|A])nalyze", "${1}nalyse" ),
064                    new Regex( "([a|A])nalyzing", "${1}nalysing" ),
065                    new Regex( "([a|A])pologiz", "${1}pologis" ),
066                    new Regex( "([a|A])uthoriz", "${1}uthoris" ),
067                    new Regex( "([b|B])aptiz", "${1}aptis" ),
068                    new Regex( "([c|C])haracteriz", "${1}haracteris" ),
069                    new Regex( "([c|C])iviliz", "${1}ivilis" ),
070                    new Regex( "([c|C])ustomiz", "${1}ustomis" ),
071                    new Regex( "([c|C])riticiz", "${1}riticis" ),
072                    new Regex( "([c|C])rystalliz", "${1}rystallis" ),
073                    new Regex( "([d|D])ecentraliz", "${1}ecentralis" ),
074                    new Regex( "([e|E])conomiz", "${1}conomis" ),
075                    new Regex( "([e|E])mphasiz", "${1}mphasis" ),
076                    new Regex( "([e|E])ulogiz", "${1}ulogis" ),
077                    new Regex( "([e|E])vangeliz", "${1}vangelis" ),
078                    new Regex( "([f|F])amiliariz", "${1}amiliaris" ),
079                    new Regex( "([g|G])alvaniz", "${1}alvanis" ),
080                    new Regex( "([g|G])eneraliz", "${1}eneralis" ),
081                    new Regex( "([h|H])armoniz", "${1}armonis" ),
082                    new Regex( "([h|H])ypnotiz", "${1}ypnotis" ),
083                    new Regex( "([m|M])aterializ", "${1}aterialis" ),
084                    new Regex( "([m|M])esmerize", "${1}esmerise" ),
085                    new Regex( "([m|M])inimiz", "${1}inimis" ),
086                    new Regex( "([m|M])obiliz", "${1}obilis" ),
087                    new Regex( "([m|M])onopoliz", "${1}nopolis" ),
088                    new Regex( "([m|M])oraliz", "${1}oralis" ),
089                    new Regex( "([n|N])eutraliz", "${1}eutralis" ),
090                    new Regex( "([o|O])rganiz", "${1}rganis" ),
091                    new Regex( "([p|P])aralyz", "${1}aralys" ),
092                    new Regex( "([p|P])atroniz", "${1}atronis" ),
093                    new Regex( "([p|P])olariz", "${1}olaris" ),
094                    new Regex( "([p|P])ulveriz", "${1}ulveris" ),
095                    new Regex( "([r|R])ealiz", "${1}ealis" ),
096                    new Regex( "([r|R])ecogniz", "${1}ecognis" ),
097                    new Regex( "([r|R])evolutioniz", "${1}evolutionis" ),
098                    new Regex( "([s|S])candaliz", "${1}candalis" ),
099                    new Regex( "([s|S])crutiniz", "${1}crutinis" ),
100                    new Regex( "([s|S])olemniz", "${1}olemnis" ),
101                    new Regex( "([s|S])oliloquiz", "${1}oliloquis" ),
102                    new Regex( "([s|S])pecializ", "${1}pecialis" ),
103                    new Regex( "([s|S])tandardiz", "${1}tandardis" ),
104                    new Regex( "([s|S])teriliz", "${1}terilis" ),
105                    new Regex( "([s|S])ympathiz", "${1}ympathis" ),
106                    new Regex( "([t|T])antaliz", "${1}antalis" ),
107                    new Regex( "([t|T])emporiz", "${1}emporis" ),
108                    new Regex( "([t|T])eorize", "${1}eorise" ),
109                    new Regex( "([t|T])erroriz", "${1}erroris" ),
110                    new Regex( "([u|U])tiliz", "${1}tilis" ),
111                    new Regex( "([v|V])isualiz", "${1}isualis" ),
112                    new Regex( "([c|C])enter", "${1}entre" ),
113                    new Regex( "([f|F])iber", "${1}ibre" ),
114                    new Regex( "([m|M])aneuver", "${1}anoeuvre" ),
115                    new Regex( "([m|M])eager", "${1}eagre" ),
116                    new Regex( "([s|S])epulcher", "${1}epulchre" ),
117                    new Regex( "([s|S])pecter", "${1}pectre" ),
118                    new Regex( "([t|T])eater", "${1}atre" ),
119                    new Regex( "([c|C])atalog[^u]", "${1}atalogue" ),
120                    new Regex( "([d|D])ialog[^u]", "${1}ialogue" ),
121                    new Regex( "([b|B])ehoove", "${1}ehove" ),
122                    new Regex( "([d|D])efense", "${1}efence" ),
123                    new Regex( "([l|L])icense", "${1}icence" ),
124                    new Regex( "([o|O])ffense", "${1}ffence" ),
125                    new Regex( "([p|P])rogram([^a-z])", "${1}rogramme${2}" ),
126                    new Regex( "([f|F])ulfill", "${1}ulfil" ),
127                    new Regex( "([j|J])ewlry", "${1}ewellery" ),
128                    new Regex( "([m|M])arvelous", "${1}arvellous" ),
129                    new Regex( "([m|M])odeling", "${1}odelling" ),
130                    new Regex( "([s|S])illful", "${1}ilful" ),
131                    new Regex( "([s|S])pecialty", "${1}peciality" ),
132            };
133    
134            /**
135                    *       Regular expression to change miscellaneous things
136                    */
137            private static Regex miscRegex[] = {
138                    // Regular expressions to handle titles
139                    new Regex( "M\\.[\\s]{1,}([A-Z])", "M\\.\\~${1}" ),
140                    new Regex( "Mr\\.[\\s]{1,}([A-Z])", "Mr\\.\\~${1}" ),
141                    new Regex( "Mrs\\.[\\s]{1,}([A-Z])", "Mrs\\.\\~${1}" ),
142                    new Regex( "St\\.[\\s]{1,}([A-Z])", "St\\.\\~${1}" ),
143                    
144                    // Regular expressions to handle double quotes
145                    new Regex( "([\\s]{1,})\"", "${1}``" ),
146                    new Regex( "\"([\\s]{1,})", "''${1}" ),
147    
148        // Regular expressions to handle multiple dots
149        new Regex( "\\.\\.\\.", "{\\\\dots}" ),
150            };
151    
152      /**
153       * Regular expressions to convert words to accentuated words.
154       */
155      private static Regex accentRegex[] = 
156      {
157                    new Regex( "([b|B])lase[^A-z]", "${1}las\\\\'\\{e}" ),
158        new Regex( "([b|B])ona fide", "${1}on\\\\^\\{a} fide" ),
159        new Regex( "([b|B])ete[^A-z]", "${1}\\\\^\\{e}te" ),
160        new Regex( "([c|C])afe[^A-z]", "${1}af\\\\'\\{e}" ),
161        new Regex( "coup-de-grace", "coup-de-gr\\\\^\\{a}ce" ),
162        new Regex( "([f|F])acade", "${1}a\\\\c\\{c}ade" ),
163        new Regex( "([f|F])ete[^A-z]", "${1}\\\\^\\{e}te" ),
164        new Regex( "([f|F])etes[^A-z]", "${1}\\\\^\\{e}tes" ),
165        new Regex( "([s|S])ignor[^A-z]", "${1}ig\\~{n}or" ),
166        new Regex( "tete-a-tete", "t\\\\^\\{e}te-\\\\`\\{a}-t\\\\^\\{e}te" ),
167      };
168    
169            /**
170                    *       Static initialiser for javaregex package.  Also optimises all
171                    *       the regular expressions.
172                    */
173            static
174            {
175                    Key.registeredTo( "Belladona.Columbine.Creeper v1.4/530369178" );
176    
177                    int length = dateRegex.length;
178                    // Optimise the regular expressions
179                    for ( int i = 0; i < length; ++i )
180                    {
181                            dateRegex[i].optimize();
182                    }
183    
184                    length = spellingRegex.length;
185                    for ( int i = 0; i < length; ++i )
186                    {
187                            spellingRegex[i].optimize();
188                    }
189    
190                    length = miscRegex.length;
191                    for ( int i = 0; i < length; ++i )
192                    {
193                            miscRegex[i].optimize();
194                    }
195    
196        length = accentRegex.length;
197        for ( int i = 0; i < length; ++i )
198        {
199          accentRegex[i].optimize();
200        }
201            }
202    
203            /**
204                    *       Default constructor.  This is made <code>private</code> to
205                    *       prevent instantiation.
206                    */
207            private TransformTextFile() {}
208    
209            /**
210                    *       <p>Convenience method that invokes all the methods.</p>
211                    *
212                    *       @param contents - The input String object that is to be processed.
213                    *       @return String - The transformed String object.
214                    */
215            public static final String convertText( String contents )
216            {
217                    String text = contents;
218                    text = convertDates( text );
219                    text = americanToEnglish( text );
220                    text = miscConversions( text );
221        text = accentConversions( text );
222                    return text;
223            }
224    
225            /**
226                    *       <p>Convert dates to formatted date.  Special code is used
227                    *       for generating proper <code>HTML</code> formatted dates.</p>
228                    *
229                    *       @param contents - The document that is being processed.
230                    *               document.
231                    *       @return String - The document after adding the part tags.
232                    */
233            public static final String convertDates( String contents )
234            {
235                    String text = contents;
236    
237    
238                    int length = dateRegex.length;
239                    for ( int i = 0; i < length; ++i )
240                    {
241                            text = dateRegex[i].replaceAll( text );
242                    }
243    
244                    return ( text );
245            }
246    
247            /**
248                    * Convert American spelling to English.
249                    *
250                    *       @param contents - The String object that is to be converted
251                    *       @return String - The converted String object
252                    */
253            public static final String americanToEnglish( String contents )
254            {
255                    String text = contents;
256    
257                    int length = spellingRegex.length;
258                    for ( int i = 0; i < length; ++i )
259                    {
260                            text = spellingRegex[i].replaceAll( text );
261                    }
262    
263                    return ( text );
264            }
265    
266            /**
267                    *       <p>Make various other transformations to the text.  Examples
268                    *       are converting double quotes, handling titles ....</p>
269                    *
270                    *       @param contents - The String object that is to be converted
271                    *       @return String - The converted String object
272                    */
273            public static final String miscConversions( String contents )
274            {
275                    String text = contents;
276    
277                    int length = miscRegex.length;
278                    for ( int i = 0; i < length; ++i )
279                    {
280                            text = miscRegex[i].replaceAll( text );
281                    }
282    
283                    return ( text );
284            }
285    
286            /**
287                    *       <p>Make words accentuated.  Examples
288                    *       are converting facade, fete ....</p>
289                    *
290                    *       @param contents - The String object that is to be converted
291                    *       @return String - The converted String object
292                    */
293            public static final String accentConversions( String contents )
294            {
295                    String text = contents;
296    
297                    int length = accentRegex.length;
298                    for ( int i = 0; i < length; ++i )
299                    {
300                            text = accentRegex[i].replaceAll( text );
301                    }
302    
303                    return ( text );
304            }
305    }