001 package org.rakeshv.tex;
002
003 import com.stevesoft.pat.*;
004
005 /**
006 * <p>This class provides <code>static methods</code> that do very
007 * common transformations to <code>ASCII text</code> files that
008 * are converted to <code>L<sup><small>A</small></sup>T<sub><small>E</small></sub>X</code>.</p>
009 */
010 public final class TransformTextFile
011 {
012 /**
013 * Regular expressions that are used to transform unformatted
014 * dates to TeX and HTML.
015 */
016 private static Regex dateRegex[] = {
017 new Regex( "(?i) ([0-9]{1,2})st", "\\\\opt{dvi,pdf,ps}{${1}\\\\textsuperscript{st}}\\\\opt{html}{\\\\HCode{${1}<sup><small>st</small></sup>}}" ),
018 new Regex( "(?i)([0-9]{1,2})nd", "\\\\opt{dvi,pdf,ps}{${1}\\\\textsuperscript{nd}}\\\\opt{html}{\\\\HCode{${1}<sup><small>nd</small></sup>}}" ),
019 new Regex( "(?i)([0-9]{1,2})rd", "\\\\opt{dvi,pdf,ps}{${1}\\\\textsuperscript{rd}}\\\\opt{html}{\\\\HCode{${1}<sup><small>rd</small></sup>}}" ),
020 new Regex( "(?i)([0-9]{1,2})th", "\\\\opt{dvi,pdf,ps}{${1}\\\\textsuperscript{th}}\\\\opt{html}{\\\\HCode{${1}<sup><small>th</small></sup>}}" ),
021 };
022
023 /**
024 * Regular expressions that change American spelling to English
025 */
026 private static Regex spellingRegex[] = {
027 new Regex( "([a|A])rdor", "${1}rdour" ),
028 new Regex( "([a|A])rmor", "${1}rmour" ),
029 new Regex( "([a|A])rmourial", "${1}rmorial" ),
030 new Regex( "([b|B])ehavior", "${1}ehaviour" ),
031 new Regex( "([c|C])andor", "${1}andour" ),
032 new Regex( "([c|C])olor", "${1}olour" ),
033 new Regex( "([c|C])lamor", "${1}lamour" ),
034 new Regex( "([d|D])emeanor", "${1}emeanour" ),
035 new Regex( "([d|D])olor", "${1}olour" ),
036 new Regex( "([e|E])ndeavor", "${1}ndeavour" ),
037 new Regex( "([f|F])avor", "${1}avour" ),
038 new Regex( "([f|F])lavor", "${1}lavour" ),
039 new Regex( "([f|F])ervor", "${1}ervour" ),
040 new Regex( "([h|H])arbor", "${1}arbour" ),
041 new Regex( "([h|H])onor", "${1}onour" ),
042 new Regex( "([h|H])umor([^a-z]{1,})", "${1}umour${2}" ),
043 new Regex( "([h|H])umored", "${1}umoured" ),
044 new Regex( "([l|L])abor([^a-z]{1,})", "${1}abour${2}" ),
045 new Regex( "([l|L])aborer", "${1}abourer" ),
046 new Regex( "([n|N])eighbor", "${1}eighbour" ),
047 new Regex( "([o|O])dor", "${1}dour" ),
048 new Regex( "([p|P])arlor", "${1}arlour" ),
049 new Regex( "([r|R])igor", "${1}igour" ),
050 new Regex( "([r|R])igourous", "${1}igorous" ),
051 new Regex( "([r|R])umor", "${1}umour" ),
052 new Regex( "([s|S])avior", "${1}aviour" ),
053 new Regex( "([s|S])plendor", "${1}plendour" ),
054 new Regex( "([s|S])uccor", "${1}uccour" ),
055 new Regex( "([v|V])alor", "${1}alour" ),
056 new Regex( "([v|V])apor", "${1}apour" ),
057 new Regex( "([v|V])apourous", "${1}aporous" ),
058 new Regex( "([v|V])igor([^a-z])", "${1}igour${2}" ),
059 new Regex( "([e|E])labourate", "${1}laborate" ),
060 new Regex( "([h|H])onourary", "${1}onorary" ),
061 new Regex( "([l|L])abourious", "${1}aborious" ),
062 new Regex( "([a|A])goniz", "${1}gonis" ),
063 new Regex( "([a|A])nalyze", "${1}nalyse" ),
064 new Regex( "([a|A])nalyzing", "${1}nalysing" ),
065 new Regex( "([a|A])pologiz", "${1}pologis" ),
066 new Regex( "([a|A])uthoriz", "${1}uthoris" ),
067 new Regex( "([b|B])aptiz", "${1}aptis" ),
068 new Regex( "([c|C])haracteriz", "${1}haracteris" ),
069 new Regex( "([c|C])iviliz", "${1}ivilis" ),
070 new Regex( "([c|C])ustomiz", "${1}ustomis" ),
071 new Regex( "([c|C])riticiz", "${1}riticis" ),
072 new Regex( "([c|C])rystalliz", "${1}rystallis" ),
073 new Regex( "([d|D])ecentraliz", "${1}ecentralis" ),
074 new Regex( "([e|E])conomiz", "${1}conomis" ),
075 new Regex( "([e|E])mphasiz", "${1}mphasis" ),
076 new Regex( "([e|E])ulogiz", "${1}ulogis" ),
077 new Regex( "([e|E])vangeliz", "${1}vangelis" ),
078 new Regex( "([f|F])amiliariz", "${1}amiliaris" ),
079 new Regex( "([g|G])alvaniz", "${1}alvanis" ),
080 new Regex( "([g|G])eneraliz", "${1}eneralis" ),
081 new Regex( "([h|H])armoniz", "${1}armonis" ),
082 new Regex( "([h|H])ypnotiz", "${1}ypnotis" ),
083 new Regex( "([m|M])aterializ", "${1}aterialis" ),
084 new Regex( "([m|M])esmerize", "${1}esmerise" ),
085 new Regex( "([m|M])inimiz", "${1}inimis" ),
086 new Regex( "([m|M])obiliz", "${1}obilis" ),
087 new Regex( "([m|M])onopoliz", "${1}nopolis" ),
088 new Regex( "([m|M])oraliz", "${1}oralis" ),
089 new Regex( "([n|N])eutraliz", "${1}eutralis" ),
090 new Regex( "([o|O])rganiz", "${1}rganis" ),
091 new Regex( "([p|P])aralyz", "${1}aralys" ),
092 new Regex( "([p|P])atroniz", "${1}atronis" ),
093 new Regex( "([p|P])olariz", "${1}olaris" ),
094 new Regex( "([p|P])ulveriz", "${1}ulveris" ),
095 new Regex( "([r|R])ealiz", "${1}ealis" ),
096 new Regex( "([r|R])ecogniz", "${1}ecognis" ),
097 new Regex( "([r|R])evolutioniz", "${1}evolutionis" ),
098 new Regex( "([s|S])candaliz", "${1}candalis" ),
099 new Regex( "([s|S])crutiniz", "${1}crutinis" ),
100 new Regex( "([s|S])olemniz", "${1}olemnis" ),
101 new Regex( "([s|S])oliloquiz", "${1}oliloquis" ),
102 new Regex( "([s|S])pecializ", "${1}pecialis" ),
103 new Regex( "([s|S])tandardiz", "${1}tandardis" ),
104 new Regex( "([s|S])teriliz", "${1}terilis" ),
105 new Regex( "([s|S])ympathiz", "${1}ympathis" ),
106 new Regex( "([t|T])antaliz", "${1}antalis" ),
107 new Regex( "([t|T])emporiz", "${1}emporis" ),
108 new Regex( "([t|T])eorize", "${1}eorise" ),
109 new Regex( "([t|T])erroriz", "${1}erroris" ),
110 new Regex( "([u|U])tiliz", "${1}tilis" ),
111 new Regex( "([v|V])isualiz", "${1}isualis" ),
112 new Regex( "([c|C])enter", "${1}entre" ),
113 new Regex( "([f|F])iber", "${1}ibre" ),
114 new Regex( "([m|M])aneuver", "${1}anoeuvre" ),
115 new Regex( "([m|M])eager", "${1}eagre" ),
116 new Regex( "([s|S])epulcher", "${1}epulchre" ),
117 new Regex( "([s|S])pecter", "${1}pectre" ),
118 new Regex( "([t|T])eater", "${1}atre" ),
119 new Regex( "([c|C])atalog[^u]", "${1}atalogue" ),
120 new Regex( "([d|D])ialog[^u]", "${1}ialogue" ),
121 new Regex( "([b|B])ehoove", "${1}ehove" ),
122 new Regex( "([d|D])efense", "${1}efence" ),
123 new Regex( "([l|L])icense", "${1}icence" ),
124 new Regex( "([o|O])ffense", "${1}ffence" ),
125 new Regex( "([p|P])rogram([^a-z])", "${1}rogramme${2}" ),
126 new Regex( "([f|F])ulfill", "${1}ulfil" ),
127 new Regex( "([j|J])ewlry", "${1}ewellery" ),
128 new Regex( "([m|M])arvelous", "${1}arvellous" ),
129 new Regex( "([m|M])odeling", "${1}odelling" ),
130 new Regex( "([s|S])illful", "${1}ilful" ),
131 new Regex( "([s|S])pecialty", "${1}peciality" ),
132 };
133
134 /**
135 * Regular expression to change miscellaneous things
136 */
137 private static Regex miscRegex[] = {
138 // Regular expressions to handle titles
139 new Regex( "M\\.[\\s]{1,}([A-Z])", "M\\.\\~${1}" ),
140 new Regex( "Mr\\.[\\s]{1,}([A-Z])", "Mr\\.\\~${1}" ),
141 new Regex( "Mrs\\.[\\s]{1,}([A-Z])", "Mrs\\.\\~${1}" ),
142 new Regex( "St\\.[\\s]{1,}([A-Z])", "St\\.\\~${1}" ),
143
144 // Regular expressions to handle double quotes
145 new Regex( "([\\s]{1,})\"", "${1}``" ),
146 new Regex( "\"([\\s]{1,})", "''${1}" ),
147
148 // Regular expressions to handle multiple dots
149 new Regex( "\\.\\.\\.", "{\\\\dots}" ),
150 };
151
152 /**
153 * Regular expressions to convert words to accentuated words.
154 */
155 private static Regex accentRegex[] =
156 {
157 new Regex( "([b|B])lase[^A-z]", "${1}las\\\\'\\{e}" ),
158 new Regex( "([b|B])ona fide", "${1}on\\\\^\\{a} fide" ),
159 new Regex( "([b|B])ete[^A-z]", "${1}\\\\^\\{e}te" ),
160 new Regex( "([c|C])afe[^A-z]", "${1}af\\\\'\\{e}" ),
161 new Regex( "coup-de-grace", "coup-de-gr\\\\^\\{a}ce" ),
162 new Regex( "([f|F])acade", "${1}a\\\\c\\{c}ade" ),
163 new Regex( "([f|F])ete[^A-z]", "${1}\\\\^\\{e}te" ),
164 new Regex( "([f|F])etes[^A-z]", "${1}\\\\^\\{e}tes" ),
165 new Regex( "([s|S])ignor[^A-z]", "${1}ig\\~{n}or" ),
166 new Regex( "tete-a-tete", "t\\\\^\\{e}te-\\\\`\\{a}-t\\\\^\\{e}te" ),
167 };
168
169 /**
170 * Static initialiser for javaregex package. Also optimises all
171 * the regular expressions.
172 */
173 static
174 {
175 Key.registeredTo( "Belladona.Columbine.Creeper v1.4/530369178" );
176
177 int length = dateRegex.length;
178 // Optimise the regular expressions
179 for ( int i = 0; i < length; ++i )
180 {
181 dateRegex[i].optimize();
182 }
183
184 length = spellingRegex.length;
185 for ( int i = 0; i < length; ++i )
186 {
187 spellingRegex[i].optimize();
188 }
189
190 length = miscRegex.length;
191 for ( int i = 0; i < length; ++i )
192 {
193 miscRegex[i].optimize();
194 }
195
196 length = accentRegex.length;
197 for ( int i = 0; i < length; ++i )
198 {
199 accentRegex[i].optimize();
200 }
201 }
202
203 /**
204 * Default constructor. This is made <code>private</code> to
205 * prevent instantiation.
206 */
207 private TransformTextFile() {}
208
209 /**
210 * <p>Convenience method that invokes all the methods.</p>
211 *
212 * @param contents - The input String object that is to be processed.
213 * @return String - The transformed String object.
214 */
215 public static final String convertText( String contents )
216 {
217 String text = contents;
218 text = convertDates( text );
219 text = americanToEnglish( text );
220 text = miscConversions( text );
221 text = accentConversions( text );
222 return text;
223 }
224
225 /**
226 * <p>Convert dates to formatted date. Special code is used
227 * for generating proper <code>HTML</code> formatted dates.</p>
228 *
229 * @param contents - The document that is being processed.
230 * document.
231 * @return String - The document after adding the part tags.
232 */
233 public static final String convertDates( String contents )
234 {
235 String text = contents;
236
237
238 int length = dateRegex.length;
239 for ( int i = 0; i < length; ++i )
240 {
241 text = dateRegex[i].replaceAll( text );
242 }
243
244 return ( text );
245 }
246
247 /**
248 * Convert American spelling to English.
249 *
250 * @param contents - The String object that is to be converted
251 * @return String - The converted String object
252 */
253 public static final String americanToEnglish( String contents )
254 {
255 String text = contents;
256
257 int length = spellingRegex.length;
258 for ( int i = 0; i < length; ++i )
259 {
260 text = spellingRegex[i].replaceAll( text );
261 }
262
263 return ( text );
264 }
265
266 /**
267 * <p>Make various other transformations to the text. Examples
268 * are converting double quotes, handling titles ....</p>
269 *
270 * @param contents - The String object that is to be converted
271 * @return String - The converted String object
272 */
273 public static final String miscConversions( String contents )
274 {
275 String text = contents;
276
277 int length = miscRegex.length;
278 for ( int i = 0; i < length; ++i )
279 {
280 text = miscRegex[i].replaceAll( text );
281 }
282
283 return ( text );
284 }
285
286 /**
287 * <p>Make words accentuated. Examples
288 * are converting facade, fete ....</p>
289 *
290 * @param contents - The String object that is to be converted
291 * @return String - The converted String object
292 */
293 public static final String accentConversions( String contents )
294 {
295 String text = contents;
296
297 int length = accentRegex.length;
298 for ( int i = 0; i < length; ++i )
299 {
300 text = accentRegex[i].replaceAll( text );
301 }
302
303 return ( text );
304 }
305 }