LibOFX
ofx_preproc.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  ofx_preproc.cpp
3  -------------------
4  copyright : (C) 2002 by Benoit Grégoire
5  email : benoitg@coeus.ca
6 ***************************************************************************/
12 /***************************************************************************
13  * *
14  * This program is free software; you can redistribute it and/or modify *
15  * it under the terms of the GNU General Public License as published by *
16  * the Free Software Foundation; either version 2 of the License, or *
17  * (at your option) any later version. *
18  * *
19  ***************************************************************************/
#include "../config.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
#include <sstream>
#include <string>
#ifdef __WIN32__
# include <io.h> // for _close()
#else
# include <unistd.h> // for close()
#endif
#include "ParserEventGeneratorKit.h"
#include "libofx.h"
#include "messages.hh"
#include "ofx_sgml.hh"
#include "ofc_sgml.hh"
#include "ofx_preproc.hh"
#include "ofx_utilities.hh"
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
37 
38 #ifdef __WIN32__
39 # define DIRSEP "\\"
40 #else
41 # define DIRSEP "/"
42 #endif
43 
44 #ifdef __WIN32__
45 # include "win32.hh"
46 # include <windows.h> // for GetModuleFileName()
47 # undef ERROR
48 # undef DELETE
49 #endif
50 
51 #define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
52 #define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
53 
// Number of directories in the DTD search list that follows; it is one larger
// when the build defines MAKEFILE_DTD_PATH, which adds an extra leading entry.
57 #ifdef MAKEFILE_DTD_PATH
58 const int DTD_SEARCH_PATH_NUM = 4;
59 #else
60 const int DTD_SEARCH_PATH_NUM = 3;
61 #endif
62 
67 {
68 #ifdef MAKEFILE_DTD_PATH
69  MAKEFILE_DTD_PATH,
70 #endif
71  "/usr/local/share/libofx/dtd",
72  "/usr/share/libofx/dtd",
73  "~"
74 };
75 
80 int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
81 {
82  LibofxContext *libofx_context;
83  bool ofx_start = false;
84  bool ofx_end = false;
85  bool file_is_xml = false;
86  bool used_iconv = false;
87  std::ifstream input_file;
88  std::ofstream tmp_file;
89  char *filenames[3];
90  char tmp_filename[256];
91  int tmp_file_fd;
92 #ifdef HAVE_ICONV
93  iconv_t conversion_descriptor;
94 #endif
95  libofx_context = (LibofxContext*)ctx;
96 
97  if (p_filename != NULL && strcmp(p_filename, "") != 0)
98  {
99  message_out(DEBUG, std::string("ofx_proc_file():Opening file: ") + p_filename);
100 
101  input_file.open(p_filename);
102  if (!input_file)
103  {
104  message_out(ERROR, "ofx_proc_file():Unable to open the input file " + std::string(p_filename));
105  }
106 
107  mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
108 
109  message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + std::string(tmp_filename));
110 #ifdef __WIN32__
111  tmp_file_fd = mkstemp_win32(tmp_filename);
112 #else
113  tmp_file_fd = mkstemp(tmp_filename);
114 #endif
115  if (tmp_file_fd)
116  {
117  tmp_file.open(tmp_filename);
118  if (!tmp_file)
119  {
120  message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + std::string(tmp_filename));
121  return -1;
122  }
123  }
124  else
125  {
126  message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + std::string(tmp_filename));
127  return -1;
128  }
129 
130  if (input_file && tmp_file)
131  {
132  std::size_t header_separator_idx;
133  std::string header_name;
134  std::string header_value;
135  std::string ofx_encoding;
136  std::string ofx_charset;
137  do
138  {
139  std::stringbuf buffer;
140  std::string s_buffer;
141  input_file.get(buffer, '\n');
142  //cout<< "got: \"" << buffer<<"\"\n";
143  s_buffer = buffer.str();
144 
145  // Watch out: If input_file is in eof(), any subsequent read or
146  // peek() will fail and we must exit this loop.
147  if (!input_file.eof())
148  {
149  //cout<<"input_file.gcount(): "<<input_file.gcount()<< " s_buffer.size=" << s_buffer.size()<<" sizeof(buffer): "<<sizeof(buffer) << " peek=\"" << int(input_file.peek()) << "\"" <<endl;
150  if (input_file.fail()) // If no characters were extracted above, the failbit is set.
151  {
152  // No characters extracted means that we've reached the newline
153  // delimiter (because we already checked for EOF). We will check
154  // for and remove that newline in the next if-clause, but must
155  // remove the failbit so that peek() will work again.
156  input_file.clear();
157  }
158 
159  // Is the next character really the newline?
160  if (input_file.peek() == '\n')
161  {
162  // Yes. Then discard that newline character from the stream
163  input_file.get();
164  }
165  }
166 
167  if (ofx_start == false && (s_buffer.find("<?xml") != std::string::npos))
168  {
169  message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped.");
170  file_is_xml = true;
171  }
172 
173  std::size_t ofx_start_idx;
174  if (ofx_start == false)
175  {
176  if (
177  (libofx_context->currentFileType() == OFX &&
178  ((ofx_start_idx = s_buffer.find("<OFX>")) != std::string::npos ||
179  (ofx_start_idx = s_buffer.find("<ofx>")) != std::string::npos))
180  ||
181  (libofx_context->currentFileType() == OFC &&
182  ((ofx_start_idx = s_buffer.find("<OFC>")) != std::string::npos ||
183  (ofx_start_idx = s_buffer.find("<ofc>")) != std::string::npos))
184  )
185  {
186  ofx_start = true;
187  if (file_is_xml == false)
188  {
189  s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header.
190  }
191  message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found");
192 
193  if (file_is_xml == true)
194  {
195  static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
196  if (putenv(sp_charset_fixed) != 0)
197  {
198  message_out(ERROR, "ofx_proc_file(): putenv failed");
199  }
200  /* Normally the following would be "xml".
201  * Unfortunately, opensp's generic api will garble UTF-8 if this is
202  * set to xml. So we set any single byte encoding to avoid messing
203  * up UTF-8. Unfortunately this means that non-UTF-8 files will not
204  * get properly translated. We'd need to manually detect the
205  * encoding in the XML header and convert the xml with iconv like we
206  * do for SGML to work around the problem. Most unfortunate. */
207  static char sp_encoding[] = "SP_ENCODING=ms-dos";
208  if (putenv(sp_encoding) != 0)
209  {
210  message_out(ERROR, "ofx_proc_file(): putenv failed");
211  }
212  }
213  else
214  {
215  static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
216  if (putenv(sp_charset_fixed) != 0)
217  {
218  message_out(ERROR, "ofx_proc_file(): putenv failed");
219  }
220  static char sp_encoding[] = "SP_ENCODING=ms-dos"; //Any single byte encoding will do, we don't want opensp messing up UTF-8;
221  if (putenv(sp_encoding) != 0)
222  {
223  message_out(ERROR, "ofx_proc_file(): putenv failed");
224  }
225 #ifdef HAVE_ICONV
226  std::string fromcode;
227  std::string tocode;
228  if (ofx_encoding.compare("USASCII") == 0)
229  {
230  if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0)
231  {
232  //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well...
233  fromcode = "ISO-8859-1";
234  }
235  else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0)
236  {
237  //Only "1252" is actually a legal value, but since the banks follows the spec SO well...
238  fromcode = "CP1252";
239  }
240  else if (ofx_charset.compare("NONE") == 0)
241  {
242  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
243  }
244  else
245  {
246  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
247  }
248  }
249  else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0)
250  {
251  //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such...
252  fromcode = "UTF-8";
253  }
254  else
255  {
256  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
257  }
258  tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
259  message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode);
260  conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
261  used_iconv = true;
262 #endif
263  }
264  }
265  else
266  {
267  //We are still in the headers
268  if ((header_separator_idx = s_buffer.find(':')) != std::string::npos)
269  {
270  //Header processing
271  header_name.assign(s_buffer.substr(0, header_separator_idx));
272  header_value.assign(s_buffer.substr(header_separator_idx + 1));
273  while ( header_value[header_value.length() - 1 ] == '\n' ||
274  header_value[header_value.length() - 1 ] == '\r' )
275  header_value.erase(header_value.length() - 1);
276  message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found");
277  if (header_name.compare("ENCODING") == 0)
278  {
279  ofx_encoding.assign(header_value);
280  }
281  if (header_name.compare("CHARSET") == 0)
282  {
283  ofx_charset.assign(header_value);
284  }
285  }
286  }
287  }
288 
289  if (file_is_xml == true || (ofx_start == true && ofx_end == false))
290  {
291  if (ofx_start == true)
292  {
293  /* The above test won't help us if the <OFX> tag is on the same line
294  * as the xml header, but as opensp can't be used to parse it anyway
295  * this isn't a great loss for now.
296  */
297  s_buffer = sanitize_proprietary_tags(s_buffer);
298  if (s_buffer.empty())
299  continue;
300  }
301  //cout<< s_buffer<<"\n";
302  if (file_is_xml == false)
303  {
304 #ifdef HAVE_ICONV
305  size_t inbytesleft = s_buffer.size();
306  size_t outbytesleft = inbytesleft * 2 - 1;
307  char * iconv_buffer = (char*) malloc (inbytesleft * 2);
308  memset(iconv_buffer, 0, inbytesleft * 2);
309  const char* inchar = s_buffer.c_str();
310  char * outchar = iconv_buffer;
311  int iconv_retval = iconv (conversion_descriptor,
312  const_cast<char**>(&inchar), &inbytesleft,
313  &outchar, &outbytesleft);
314  if (iconv_retval == -1)
315  {
316  message_out(ERROR, "ofx_proc_file(): Iconv conversion error");
317  }
318  // All validly converted bytes will be copied to the
319  // original buffer
320  s_buffer = std::string(iconv_buffer, outchar - iconv_buffer);
321  free (iconv_buffer);
322 #endif
323  }
324  //cout << s_buffer << "\n";
325  tmp_file << s_buffer << std::endl;
326  }
327 
328  if (ofx_start == true &&
329  (
330  (libofx_context->currentFileType() == OFX &&
331  ((ofx_start_idx = s_buffer.find("</OFX>")) != std::string::npos ||
332  (ofx_start_idx = s_buffer.find("</ofx>")) != std::string::npos))
333  || (libofx_context->currentFileType() == OFC &&
334  ((ofx_start_idx = s_buffer.find("</OFC>")) != std::string::npos ||
335  (ofx_start_idx = s_buffer.find("</ofc>")) != std::string::npos))
336  )
337  )
338  {
339  ofx_end = true;
340  message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC> has been found");
341  }
342 
343  }
344  while (!input_file.eof() && !input_file.bad());
345  }
346  input_file.close();
347  tmp_file.close();
348 #ifdef HAVE_ICONV
349  if (used_iconv == true)
350  {
351  iconv_close(conversion_descriptor);
352  }
353 #endif
354  char filename_openspdtd[255];
355  char filename_dtd[255];
356  char filename_ofx[255];
357  STRNCPY(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME)); //The opensp sgml dtd file
358  if (libofx_context->currentFileType() == OFX)
359  {
360  STRNCPY(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME)); //The ofx dtd file
361  }
362  else if (libofx_context->currentFileType() == OFC)
363  {
364  STRNCPY(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME)); //The ofc dtd file
365  }
366  else
367  {
368  message_out(ERROR, std::string("ofx_proc_file(): Error unknown file format for the OFX parser"));
369  }
370 
371  if ((std::string)filename_dtd != "" && (std::string)filename_openspdtd != "")
372  {
373  strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file
374  filenames[0] = filename_openspdtd;
375  filenames[1] = filename_dtd;
376  filenames[2] = filename_ofx;
377  int rv;
378  if (libofx_context->currentFileType() == OFX)
379  {
380  rv = ofx_proc_sgml(libofx_context, 3, filenames);
381  }
382  else if (libofx_context->currentFileType() == OFC)
383  {
384  rv = ofc_proc_sgml(libofx_context, 3, filenames);
385  }
386  else
387  {
388  message_out(ERROR, std::string("ofx_proc_file(): Error unknown file format for the OFX parser"));
389  rv = -1;
390  }
391  if (remove(tmp_filename) != 0)
392  {
393  message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + std::string(tmp_filename));
394  }
395  return rv;
396  }
397  else
398  {
399  message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting");
400  return -1;
401  }
402  }
403  else
404  {
405  message_out(ERROR, "ofx_proc_file():No input file specified");
406  return -1;
407  }
408  return 0;
409 }
410 
/* Looks for the next tag (opening or closing) in input_string, starting the
 * search at pos_start.
 *
 * On success the text between '<' and '>' is returned, pos_start is set to
 * the position of the '<' and pos_end to the position just after the '>'.
 * If the '<' has no matching '>', everything after the '<' is returned and
 * pos_end is set to std::string::npos.
 * If there is no '<' at all, an empty string is returned and both pos_start
 * and pos_end are set to std::string::npos.
 */
static std::string find_tag_open (std::string& input_string, size_t& pos_start, size_t& pos_end)
{
  pos_start = input_string.find ('<', pos_start);
  if (pos_start == std::string::npos)
  {
    pos_end = std::string::npos;
    return std::string();
  }

  const size_t name_begin = pos_start + 1;
  const size_t closing_bracket = input_string.find ('>', name_begin);
  if (closing_bracket == std::string::npos)
  {
    // Unterminated tag: the name is whatever follows the '<'.
    pos_end = std::string::npos;
    return input_string.substr(name_begin);
  }

  pos_end = closing_bracket + 1;
  return input_string.substr(name_begin, closing_bracket - name_begin);
}
432 
433 /* Searches input string for a closing tag matching tag_name starting at pos.
434  * If found pos will be set to the position right after of the closing '>'
435  * If no matching closing tag is found pos will be set to the start of the next
436  * opening or closing tag found.
437  */
438 static void find_tag_close (std::string& input_string, std::string& tag_name, size_t& pos)
439 {
440  size_t start_idx = input_string.find ("</" + tag_name + ">", pos);
441 
442  if (start_idx == std::string::npos)
443  {
444  start_idx = pos;
445  size_t end_idx;
446  std::string new_tag_name = find_tag_open (input_string, start_idx, end_idx);
447  if (!new_tag_name.empty())
448  {
449  message_out(DEBUG, "find_tag_close() fell back to next open tag: " + new_tag_name);
450  // find_tag_open returns the *end* of an opening tag, but in this
451  // case we want its start, so we need to rewind a bit..
452  pos = start_idx;
453  //printf("find_tag_close() returning pos after fallback: %d\n",pos);
454  }
455  else
456  {
457  pos = input_string.length();
458  }
459  }
460  else
461  {
462  pos = start_idx + tag_name.length() + 3;
463  }
464  return;
465 }
466 
467 
479 std::string sanitize_proprietary_tags(std::string input_string)
480 {
481  size_t last_known_good_pos = 0;
482  size_t open_tag_start_pos = last_known_good_pos;
483  size_t open_tag_end_pos;
484  size_t close_tag_end_pos;
485 
486  std::string tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
487  while (!tag_name.empty())
488  {
489  // Determine whether the current tag is proprietary.
490  if ((tag_name.find('.') != std::string::npos) || // tag has a . in the name
491  (tag_name == "CATEGORY")) // Chase bank started setting these in 2017
492  {
493  close_tag_end_pos = open_tag_end_pos;
494  find_tag_close (input_string, tag_name, close_tag_end_pos);
495  size_t tag_size = close_tag_end_pos - open_tag_start_pos;
496  std::string prop_tag = input_string.substr(open_tag_start_pos, tag_size);
497  message_out(INFO, "sanitize_proprietary_tags() removed: " + prop_tag);
498  input_string.erase(open_tag_start_pos, tag_size);
499  last_known_good_pos = open_tag_start_pos;
500  }
501  else
502  {
503  last_known_good_pos = open_tag_end_pos;
504  }
505  tag_name.clear();
506  open_tag_start_pos = last_known_good_pos;
507  if (last_known_good_pos != std::string::npos)
508  tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
509  }
510  return input_string;
511 }
512 
513 
#ifdef __WIN32__
/* Builds the DTD directory path relative to the running executable.
 *
 * Takes the module file name, drops the file name component, additionally
 * drops a trailing "bin" or "lib" directory (case-insensitive), and appends
 * "\share\libofx\dtd". Returns an empty string if the module file name
 * cannot be obtained.
 *
 * Partial implementation of
 * http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory
 */
static std::string get_dtd_installation_directory()
{
  char module_path[MAX_PATH];
  if (!GetModuleFileName(NULL, module_path, MAX_PATH))
    return "";

  // Cut off the executable's file name.
  char *last_sep = strrchr(module_path, '\\');
  if (last_sep != NULL)
    *last_sep = '\0';

  // Cut off a trailing "bin" or "lib" directory, if there is one.
  last_sep = strrchr(module_path, '\\');
  if (last_sep != NULL)
  {
    const char *leaf = last_sep + 1;
    if (_stricmp(leaf, "bin") == 0 || _stricmp(leaf, "lib") == 0)
      *last_sep = '\0';
  }

  return std::string(module_path) + "\\share\\libofx\\dtd";
}
#endif
538 
539 
552 std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename)
553 {
554  std::string dtd_path_filename;
555  char *env_dtd_path;
556 
557  dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir();
558  if (!dtd_path_filename.empty())
559  {
560  dtd_path_filename.append(dtd_filename);
561  std::ifstream dtd_file(dtd_path_filename.c_str());
562  if (dtd_file)
563  {
564  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
565  return dtd_path_filename;
566  }
567  }
568 
569 #ifdef __WIN32__
570  dtd_path_filename = get_dtd_installation_directory();
571  if (!dtd_path_filename.empty())
572  {
573  dtd_path_filename.append(DIRSEP);
574  dtd_path_filename.append(dtd_filename);
575  std::ifstream dtd_file(dtd_path_filename.c_str());
576  if (dtd_file)
577  {
578  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
579  return dtd_path_filename;
580  }
581  }
582 #endif
583  /* Search in environment variable OFX_DTD_PATH */
584  env_dtd_path = getenv("OFX_DTD_PATH");
585  if (env_dtd_path)
586  {
587  dtd_path_filename.append(env_dtd_path);
588  dtd_path_filename.append(DIRSEP);
589  dtd_path_filename.append(dtd_filename);
590  std::ifstream dtd_file(dtd_path_filename.c_str());
591  if (!dtd_file)
592  {
593  message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename);
594  }
595  else
596  {
597  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
598  return dtd_path_filename;
599  }
600  }
601 
602  for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++)
603  {
604  dtd_path_filename = DTD_SEARCH_PATH[i];
605  dtd_path_filename.append(DIRSEP);
606  dtd_path_filename.append(dtd_filename);
607  std::ifstream dtd_file(dtd_path_filename.c_str());
608  if (!dtd_file)
609  {
610  message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename);
611  }
612  else
613  {
614  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
615  return dtd_path_filename;
616  }
617  }
618 
619  /* Last resort, look in source tree relative path (useful for development) */
620  dtd_path_filename = "";
621  dtd_path_filename.append("..");
622  dtd_path_filename.append(DIRSEP);
623  dtd_path_filename.append("dtd");
624  dtd_path_filename.append(DIRSEP);
625  dtd_path_filename.append(dtd_filename);
626  std::ifstream dtd_file(dtd_path_filename.c_str());
627  if (!dtd_file)
628  {
629  message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree.");
630  }
631  else
632  {
633  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
634  return dtd_path_filename;
635  }
636 
637 
638  message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename);
639  return "";
640 }
Main header file containing the LibOfx API.
@ OFX
Definition: libofx.h:129
@ OFC
Definition: libofx.h:130
int message_out(OfxMsgType error_type, const std::string message)
Message output function.
Definition: messages.cpp:61
Message IO functionality.
@ DEBUG
Definition: messages.hh:25
@ ERROR
Definition: messages.hh:34
@ INFO
Definition: messages.hh:32
@ STATUS
Definition: messages.hh:31
int ofc_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofc_sgml.cpp:351
OFX/SGML parsing functionality.
std::string sanitize_proprietary_tags(std::string input_string)
Removes proprietary tags and comments.
const char * DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM]
The list of paths to search for the DTDs.
Definition: ofx_preproc.cpp:66
const int DTD_SEARCH_PATH_NUM
The number of different paths to search for DTDs.
Definition: ofx_preproc.cpp:60
std::string find_dtd(LibofxContextPtr ctx, const std::string &dtd_filename)
Find the appropriate DTD for the file version.
int ofx_proc_file(LibofxContextPtr ctx, const char *p_filename)
File pre-processing of OFX AND for OFC files.
Definition: ofx_preproc.cpp:80
Preprocessing of the OFX files before parsing.
int ofx_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofx_sgml.cpp:433
OFX/SGML parsing functionality.
Various simple functions for type conversion & al.
void STRNCPY(T &dest, const std::string &src)