package genbook;

import java.util.regex.*;


/**
 * Turns a piece of HTML into plain text by removing markup and decoding a
 * small, fixed set of character entities.
 *
 * <p>The instance holds the text in {@link #content}. {@link #removeTags()}
 * overwrites that field with its result, so a later call to
 * {@link #cleanHTML()} works on the already-cleaned text.
 */
public class TagRemover {

	/**
	 * A whole {@code <script>} or {@code <style>} element, body included.
	 * The body is matched reluctantly with DOTALL so it may span lines and
	 * contain {@code <} (for example {@code a < b} in JavaScript). The
	 * backreference makes the closing tag name agree with the opening one,
	 * and optional whitespace is accepted before the closing {@code >}.
	 */
	private static final Pattern SCRIPT_OR_STYLE = Pattern.compile(
			"<(script|style)(?:\\s[^>]*)?>.*?</\\1\\s*>",
			Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/** Any remaining tag: a "<", then everything up to and including the next ">". */
	private static final Pattern ANY_TAG = Pattern.compile("<[^>]*>");

	/** One or two line breaks followed by at least one more whitespace-only line. */
	private static final Pattern BLANK_LINES = Pattern.compile("(\\n{1,2})(\\s*\\n)+");

	/**
	 * Entity patterns, applied in array order; each match is replaced by the
	 * string at the same index of {@link #ENTITY_REPLACEMENTS}.
	 * The ampersand entry must stay last: decoding it earlier would turn
	 * escaped text such as "&amp;nbsp;" into "&nbsp;" and a later entry
	 * would then decode it a second time.
	 */
	private static final Pattern[] ENTITY_PATTERNS = {
		Pattern.compile("(&#39;|&apos;)", Pattern.CASE_INSENSITIVE),
		Pattern.compile("(&middot;|&#183;)", Pattern.CASE_INSENSITIVE),
		Pattern.compile("(&quot;|&#34;)", Pattern.CASE_INSENSITIVE),
		Pattern.compile("(&rsquo;)", Pattern.CASE_INSENSITIVE),
		// Every single whitespace character (tab, line break, ...) and every
		// non-breaking-space entity becomes one plain space; runs are not merged.
		Pattern.compile("(&nbsp;|\\s)", Pattern.CASE_INSENSITIVE),
		Pattern.compile("(&amp;|&#38;)", Pattern.CASE_INSENSITIVE),
	};

	/** Replacement text for the pattern at the same index of {@link #ENTITY_PATTERNS}. */
	private static final String[] ENTITY_REPLACEMENTS = {"'", " ", "\"", "'", " ", "&"};

	/** The text being cleaned; replaced by the result of {@link #removeTags()}. */
	String content;

	/**
	 * @param content the HTML to clean; {@code null} is treated as empty
	 *                input by the cleaning methods
	 */
	public TagRemover(String content){
		this.content = content;
	}

	/**
	 * Removes script/style elements and all other tags, reduces runs of
	 * blank lines, decodes the supported entities and turns every
	 * whitespace character into a space.
	 *
	 * <p>The result is also stored back into {@link #content}.
	 *
	 * @return the cleaned text, or an empty string when the content is {@code null}
	 */
	public String removeTags(){
		if (content == null) {
			return "";
		}

		// Script and style elements go first so their bodies are dropped
		// together with the tags instead of surviving as visible text.
		String text = SCRIPT_OR_STYLE.matcher(content).replaceAll("");
		text = ANY_TAG.matcher(text).replaceAll("");

		// Two passes over runs of line breaks: the first keeps the leading one
		// or two breaks of each run, the second keeps the last captured
		// whitespace-plus-break group. Both are kept as they were so the number
		// of spaces in the final text does not change for existing callers.
		text = BLANK_LINES.matcher(text).replaceAll("$1");
		text = BLANK_LINES.matcher(text).replaceAll("$2");

		for (int i = 0; i < ENTITY_PATTERNS.length; i++) {
			text = ENTITY_PATTERNS[i].matcher(text).replaceAll(ENTITY_REPLACEMENTS[i]);
		}

		content = text;
		return text;
	}

	/**
	 * Strips tags from {@link #content} using {@link #stripHTMLTags(String)}.
	 * Unlike {@link #removeTags()} this leaves the field untouched and does
	 * not decode entities or alter whitespace.
	 *
	 * @return the content without tags
	 */
	public String cleanHTML(){
		return stripHTMLTags(content);
	}

	/**
	 * Removes every span that starts at a "<" and ends at the next ">" after it.
	 *
	 * <p>A ">" that is not preceded by an unclosed "<" is ordinary text and is
	 * kept. A "<" with no ">" after it is also kept, together with the rest of
	 * the message, because there is no complete tag to remove.
	 *
	 * @param message the text to strip; may be {@code null}
	 * @return the text without tags, or an empty string for {@code null}
	 */
	public String stripHTMLTags( String message ) {
		if (message == null) {
			return "";
		}
		StringBuilder stripped = new StringBuilder(message.length());
		int cursor = 0; // start of the text not yet copied to the result
		int open = message.indexOf('<');
		while (open != -1) {
			// Look for the closing bracket only after the opening one, so a
			// stray ">" earlier in the text cannot produce an inverted range.
			int close = message.indexOf('>', open + 1);
			if (close == -1) {
				break; // unterminated tag: keep the remainder verbatim
			}
			stripped.append(message, cursor, open);
			cursor = close + 1;
			open = message.indexOf('<', cursor);
		}
		stripped.append(message, cursor, message.length());
		return stripped.toString();
	}

}
