THRIFT-1800 Documentation text not always escaped correctly when rendered to HTML

Patch: Jens Geyer
2024-11-07 10:48:51 +00:00 · 2012-12-24 10:32:58 +01:00 · 2012-12-24 10:32:58 +01:00 · 63e3c63078
commit 63e3c63078
parent d0c3586e49
1 changed files with 146 additions and 10 deletions
--- a/compiler/cpp/src/generate/t_html_generator.cc
+++ b/compiler/cpp/src/generate/t_html_generator.cc
@ -32,6 +32,8 @@
 using namespace std;


+/**
+ * Encoding assumed for the non-ASCII bytes of the text that gets rendered.
+ */
+enum input_type {
+  INPUT_UNKNOWN,  // initial state, no non-ASCII byte has been examined yet
+  INPUT_UTF8,     // a UTF-8 sequence was detected, such bytes are kept as they are
+  INPUT_PLAIN     // fallback, each such byte is written as &#<num>;
+};
+
 /**
 * HTML code generator
 *
@ -48,6 +50,7 @@ class t_html_generator : public t_generator {
    (void) parsed_options;
    (void) option_string;  
    out_dir_base_ = "gen-html";
+    input_type_ = INPUT_UNKNOWN;
    
    std::map<std::string, std::string>::const_iterator iter;
    iter = parsed_options.find("standalone");
@ -67,10 +70,13 @@ class t_html_generator : public t_generator {
  void generate_program_toc_rows(t_program* tprog,
         std::vector<t_program*>& finished);
  void generate_index();
+  std::string escape_html(std::string const & str);
  void generate_css();
  void generate_css_content(std::ofstream & f_target);
  void generate_style_tag();
  std::string make_file_link( std::string name);
+  bool is_utf8_sequence(std::string const & str, size_t firstpos);
+  void detect_input_encoding(std::string const & str, size_t firstpos);
  
  /**
   * Program-level generation functions
@ -91,10 +97,12 @@ class t_html_generator : public t_generator {
 private:
  std::ofstream f_out_;
  std::string  current_file_;
+  input_type input_type_;
 
  bool standalone_; 
 };

+
 /**
 * Emits the Table of Contents links at the top of the module's page
 */
@ -301,7 +309,7 @@ void t_html_generator::generate_program() {

  f_out_ << "</div></body></html>" << endl;
  f_out_.close();
-
+  
  generate_index();
  generate_css();
 }
@ -383,20 +391,148 @@ void t_html_generator::print_doc(t_doc* tdoc) {
    size_t index;
    while ((index = doc.find_first_of("\r\n")) != string::npos) {
      if (index == 0) {
-  f_out_ << "<p/>" << endl;
+        f_out_ << "<p/>" << endl;
      } else {
-  f_out_ << doc.substr(0, index) << endl;
+        f_out_ << escape_html( doc.substr(0, index)) << endl;
      }
      if (index + 1 < doc.size() && doc.at(index) != doc.at(index + 1) &&
-    (doc.at(index + 1) == '\r' || doc.at(index + 1) == '\n')) {
-  index++;
+         (doc.at(index + 1) == '\r' || doc.at(index + 1) == '\n')) {
+        index++;
      }
      doc = doc.substr(index + 1);
    }
-    f_out_ << doc << "<br/>";
+    f_out_ << escape_html(doc) << "<br/>";
  }
 }

+/**
+ * Tests whether a well-formed UTF-8 multi-byte sequence starts at firstpos.
+ *
+ * Only the forms permitted by RFC 3629 are accepted: two to four bytes,
+ * no overlong encodings, no UTF-16 surrogates, nothing above U+10FFFF.
+ * The historic 5- and 6-byte forms are rejected.
+ *
+ * @param str      text to inspect
+ * @param firstpos index of the presumed leading byte
+ * @return true if a complete, well-formed multi-byte sequence starts at
+ *         firstpos; false for ASCII bytes, stray continuation bytes, invalid
+ *         or truncated sequences, and when firstpos lies outside of str
+ */
+bool t_html_generator::is_utf8_sequence(std::string const & str, size_t firstpos) {
+  if( firstpos >= str.length()) {
+    return false;  // nothing to inspect
+  }
+
+  // the leading byte determines the number of continuation bytes and,
+  // for some leads, a narrower range for the first continuation byte
+  const unsigned char lead = static_cast<unsigned char>( str[firstpos]);
+  size_t count = 0;
+  unsigned char low  = 0x80;
+  unsigned char high = 0xBF;
+  if(        (0xC2 <= lead) && (lead <= 0xDF)) {
+    count = 1;  // 0xC0 and 0xC1 could only start overlong encodings
+  } else if( (0xE0 <= lead) && (lead <= 0xEF)) {
+    count = 2;
+    if( lead == 0xE0) {
+      low = 0xA0;   // below that: overlong encoding
+    } else if( lead == 0xED) {
+      high = 0x9F;  // above that: UTF-16 surrogates
+    }
+  } else if( (0xF0 <= lead) && (lead <= 0xF4)) {
+    count = 3;
+    if( lead == 0xF0) {
+      low = 0x90;   // below that: overlong encoding
+    } else if( lead == 0xF4) {
+      high = 0x8F;  // above that: beyond U+10FFFF
+    }
+  } else {
+    //pdebug("UTF-8 test: char '%c' (%d) is not a valid UTF-8 leading byte", lead, int(lead));
+    return false;  // no UTF-8
+  }
+
+  // the sequence must not be cut off by the end of the string
+  if( str.length() - firstpos <= count) {
+    return false;
+  }
+
+  // following chars
+  for( size_t offset = 1; offset <= count; ++offset) {
+    const unsigned char c = static_cast<unsigned char>( str[firstpos + offset]);
+    if( (c < low) || (high < c)) {
+      //pdebug("UTF-8 test: char '%c' (%d) is not a valid UTF-8 following byte", c, int(c));
+      return false;  // no UTF-8
+    }
+    // only the first continuation byte has a restricted range
+    low  = 0x80;
+    high = 0xBF;
+  }
+
+  return true;
+}
+
+/**
+ * Decides how non-ASCII input bytes are going to be treated and stores the
+ * decision in input_type_: INPUT_UTF8 if is_utf8_sequence() accepts the bytes
+ * starting at firstpos, INPUT_PLAIN otherwise.
+ *
+ * @param str      text that contains the byte in question
+ * @param firstpos index of the first non-ASCII byte within str
+ */
+void t_html_generator::detect_input_encoding(std::string const & str, size_t firstpos) {
+  const bool looks_like_utf8 = is_utf8_sequence( str, firstpos);
+
+  if( !looks_like_utf8) {
+    // fallback
+    pwarning( 1, "Input is not UTF-8, treating as plain ANSI");
+    input_type_ = INPUT_PLAIN;
+    return;
+  }
+
+  pdebug( "Input seems to be already UTF-8 encoded");
+  input_type_ = INPUT_UTF8;
+}
+
+/**
+ * Prepares text for inclusion into the generated HTML page.
+ *
+ * - chars 32..127 are copied unchanged
+ * - CR and LF become "<br/>", a CR/LF or LF/CR pair counts as one line break
+ *   (the same rule print_doc() applies when it splits lines)
+ * - TAB becomes a blank, all other control codes are dropped
+ * - bytes above 127 are kept as they are for UTF-8 input, and written as
+ *   &#<num>; otherwise; the encoding is detected at the first such byte
+ *   and remembered in input_type_
+ *
+ * The handling of ASCII and control codes does not depend on the detected
+ * encoding, so a given ASCII-only text always renders the same way.
+ *
+ * NOTE(review): '<', '>' and '&' are copied unchanged, markup contained in
+ * the input reaches the page as markup - confirm that this is intended for
+ * every caller.
+ *
+ * @param str text to be converted
+ * @return the converted text
+ * @throws const char* if the input encoding is neither UTF-8 nor plain
+ */
+std::string t_html_generator::escape_html(std::string const & str) {
+  std::ostringstream result;
+  const size_t length = str.length();
+  size_t pos = 0;
+  while( pos < length) {
+    const unsigned char c = static_cast<unsigned char>( str[pos]);
+
+    // copy a whole run of plain ASCII chars at once
+    if( (32 <= c) && (c <= 127)) {
+      size_t runend = pos + 1;
+      while( runend < length) {
+        const unsigned char next = static_cast<unsigned char>( str[runend]);
+        if( (next < 32) || (127 < next)) {
+          break;
+        }
+        ++runend;
+      }
+      result << str.substr( pos, runend - pos);
+      pos = runend;
+      continue;
+    }
+
+    // some control code?
+    if( c < 32) {
+      switch( c)
+      {
+        case '\r' :
+        case '\n' :
+          result << "<br/>";
+          // swallow the second half of a CR/LF or LF/CR pair
+          if( (pos + 1 < length) && (str[pos + 1] != str[pos]) &&
+              ((str[pos + 1] == '\r') || (str[pos + 1] == '\n'))) {
+            ++pos;
+          }
+          break;
+        case '\t' :
+          result << " ";
+          break;
+        default:
+          break;  // any other control code is dropped
+      }
+      ++pos;
+      continue;
+    }
+
+    // non-ASCII byte: try to detect input encoding
+    if( input_type_ == INPUT_UNKNOWN) {
+      detect_input_encoding( str, pos);
+    }
+
+    // convert the byte to something useful based on the detected encoding
+    switch( input_type_) {
+      case INPUT_UTF8:
+        // the generated HTML header says it is UTF-8 encoded, keep the byte
+        result << str[pos];
+        break;
+      case INPUT_PLAIN:
+        result << "&#" << static_cast<unsigned int>( c) << ";";
+        break;
+      default:
+        throw "Unexpected or unrecognized input encoding";
+    }
+    ++pos;
+  }
+
+  return result.str();
+}
+
 /**
 * Prints out the provided type in HTML
 */
@ -460,7 +596,7 @@ void t_html_generator::print_const_value(t_const_value* tvalue) {
    f_out_ << tvalue->get_double();
    break;
  case t_const_value::CV_STRING:
-    f_out_ << '"' << get_escaped_string(tvalue) << '"';
+    f_out_ << '"' << escape_html( get_escaped_string(tvalue)) << '"';
    break;
  case t_const_value::CV_MAP:
    {
@ -521,7 +657,7 @@ void t_html_generator::print_fn_args_doc(t_function* tfunction) {
      for ( ; arg_iter != args.end(); arg_iter++) {
        f_out_ << "<tr><td>" << (*arg_iter)->get_name();
        f_out_ << "</td><td>";
-        f_out_ << (*arg_iter)->get_doc();
+        f_out_ << escape_html( (*arg_iter)->get_doc());
        f_out_ << "</td></tr>" << endl;
      }
      f_out_ << "</table>";
@ -545,7 +681,7 @@ void t_html_generator::print_fn_args_doc(t_function* tfunction) {
      for ( ; ex_iter != excepts.end(); ex_iter++) {
        f_out_ << "<tr><td>" << (*ex_iter)->get_type()->get_name();
        f_out_ << "</td><td>";
-        f_out_ << (*ex_iter)->get_doc();
+        f_out_ << escape_html( (*ex_iter)->get_doc());
        f_out_ << "</td></tr>" << endl;
      }
      f_out_ << "</table>";
@ -640,7 +776,7 @@ void t_html_generator::generate_struct(t_struct* tstruct) {
    f_out_ << "</td><td>";
    print_type((*mem_iter)->get_type());
    f_out_ << "</td><td>";
-    f_out_ << (*mem_iter)->get_doc();
+    f_out_ << escape_html( (*mem_iter)->get_doc());
    f_out_ << "</td><td>";
    if ((*mem_iter)->get_req() == t_field::T_OPTIONAL) {
      f_out_ << "optional";