java - How to split the URL the get all the link in webpage? -


i'm doing project hyperlink crawler inspecting broken link. code. www.utem.edu.my/portal/portal. link give 404 error. think code split url wrong. me please.

public class hinterface extends jframe {      // declaring variables used components in interface     private jlabel lblurl;     private jtextfield inputsearch;     private jbutton btnsearch;     private jeditorpane outputlinks;      public hinterface() {         super("hyperlink crawler");         settype(type.popup);         setresizable(false);         getcontentpane().setbackground(color.black);         settitle("web link crawler inspecting broken link");         flowlayout flowlayout = new flowlayout();         flowlayout.setalignment(flowlayout.left);         getcontentpane().setlayout(flowlayout);          // creates label displaying text         lblurl = new jlabel("\r\nenter url :    ");         lblurl.setlocation(new point(13, 9));         lblurl.setdoublebuffered(true);         lblurl.setalignmenty(component.bottom_alignment);         lblurl.setalignmentx(component.right_alignment);         lblurl.setverticalalignment(swingconstants.top);         lblurl.setforeground(color.white);         lblurl.setfont(new font("tw cen mt condensed", font.bold, 20));         getcontentpane().add(lblurl);          // creates text field url input         inputsearch = new jtextfield();         inputsearch.settext("http://");         inputsearch.setpreferredsize(new dimension(400, 32));         inputsearch.setfont(new font("sansserif", font.bold, 17));         getcontentpane().add(inputsearch);          // creates search button         btnsearch = new jbutton("  search  ");         btnsearch.setpreferredsize(new dimension(100, 32));         btnsearch.setfont(new font("sansserif", font.bold, 13));         getcontentpane().add(btnsearch);          // adds results text area scroll-able pane         jscrollpane scrolloutput = new jscrollpane (jscrollpane.vertical_scrollbar_as_needed, jscrollpane.horizontal_scrollbar_never);         scrolloutput.setbackground(color.gray);         scrolloutput.setpreferredsize(new dimension(1100, 670));         getcontentpane().add(scrolloutput);          outputlinks = new jeditorpane();         outputlinks.addhyperlinklistener(new hyperlinklistener() {             public void hyperlinkupdate(hyperlinkevent e) {                 if (hyperlinkevent.eventtype.activated.equals(e.geteventtype())) {                     system.out.println(e.geturl());                     desktop desktop = desktop.getdesktop();                     try {                         desktop.browse(e.geturl().touri());                     } catch (exception ex) {                         ex.printstacktrace();                     }                 }              }         });           outputlinks.settext("result");         outputlinks.setcontenttype("text/html");         outputlinks.seteditable(true);         scrolloutput.setcolumnheaderview(outputlinks);         outputlinks.seteditable(false);          // add event handler search button click         handleevents theeventhandler = new handleevents();         inputsearch.addactionlistener(theeventhandler);         btnsearch.addactionlistener(theeventhandler);     }      private class handleevents implements actionlistener {          public void actionperformed(actionevent event) { // called when elements triggered             // preparing output variable string             string stroutput = "no results found!";              if (event.getsource() == btnsearch || event.getsource() == inputsearch) {                 if (!inputsearch.gettext().equals(""))                     stroutput = crawlurl(inputsearch.gettext());                 else                     stroutput = "please enter url crawl it's hyperlinks";             }              // prints out results             outputlinks.settext(stroutput);         }          public string pullurl(string strurl) {             string resutls = "";             urlconnection connection = null;             try {               connection =  new url(strurl).openconnection();               @suppresswarnings("resource")             scanner scanner = new scanner(connection.getinputstream());               scanner.usedelimiter("\\z");               if(scanner.hasnext())                   resutls = scanner.next();             } catch ( exception ex ) {                 ex.printstacktrace();             }              return resutls;         }           public string crawlurl(string strurl) {             string results = ""; // return             string protocol = "http://";              // assigns input inurl variable , checks add http             string inurl = strurl;             if (!inurl.tolowercase().contains("http://".tolowercase()) &&                      !inurl.tolowercase().contains("https://".tolowercase())) {                 inurl = protocol + inurl;             }              // pulls url contents web             string contecturl = pullurl(inurl);             if (contecturl == "") { // if fails, try https                 protocol = "https://";                 inurl = protocol + inurl.split("http://")[1];                 contecturl = pullurl(inurl);             }              // declares variables used inside loop             string atagattr = "";             string href = "";             string msg = "";              // finds tag , stores href value output var             string bodytag = contecturl.split("<body")[1]; // find 1st <body>             string[] atags = bodytag.split(">"); // splits on every tag              //to show link different 1             int index = 0;              (string s: atags) {                 // process if tag , contains href                 if (s.tolowercase().contains("<a") && s.tolowercase().contains("href")) {                      atagattr = s.split("href")[1]; // split on href                      // split on space if contains                     if (atagattr.tolowercase().contains("\\s"))                         atagattr = atagattr.split("\\s")[2];                      // splits on link , deals " or ' quotes                     href = atagattr.split(                         ((atagattr.tolowercase().contains("\""))? "\"" : "\'")                     )[1];                      if (!results.tolowercase().contains(href))                          //results += "~~~ " + href + "\r\n";                      /*                      * last touches url before display                      *      adds http(s):// if not exist                      *      adds base url if not exist                      */                          if(results.tolowercase().indexof("about") != -1) {                                //contains 'about'                             }                     if (!href.tolowercase().contains("http://") &&                             !href.tolowercase().contains("https://")) {                          // http:// + baseurl + href                         if (!href.tolowercase().contains(inurl.split("://")[1]))                             href = protocol + inurl.split("://")[1] + href;                         else                             href = protocol + href;                     }                      system.out.println(href);//debug                      try {                         msg = urlheker(href);                     } catch (exception e) {                         // todo auto-generated catch block                         e.printstacktrace();                     }                      // store link in output var                     if (!results.tolowercase().contains(href)){                           results += "<a href=\"";                         results += href;                         results += "\">";                         results +=  "link" + (index + 1)+ " : "+ href  ;                         results += "</a>";                         results += "                                                                                        :  ";                         results += msg;                         results += "<br>";                         index++;                     }                  }              }              system.out.println(results);             return results;          }     }   public string urlheker(string href) throws exception {      string msg = "";     int code = 0;     url url = new url(href);     urlconnection connection = url.openconnection();         if(connection instanceof httpurlconnection) {             httpurlconnection httpconn=(httpurlconnection)connection;             code = httpconn.getresponsecode();             msg = httpconn.getresponsemessage();                 if(code == httpurlconnection.http_ok )                     system.out.println("return normal response :"+msg);                 else                     system.out.println(code);             }          msg = msg+" [" + integer.tostring(code) + "]";  return msg;       }  } 

i'm not sure if resolves problem, can check response code before getting input stream connection:

public string pullurl(string strurl) {     string resutls = "";     httpurlconnection connection = null;     try {         connection = (httpurlconnection)new url(strurl).openconnection();         //connection.getresponsecode() <- check response code         @suppresswarnings("resource") scanner scanner = new scanner(connection.getinputstream());         scanner.usedelimiter("\\z");         if (scanner.hasnext())             resutls = scanner.next();     } catch (exception ex) {         ex.printstacktrace();     }      return resutls; } 

Comments

Popular posts from this blog

magento2 - Magento 2 admin grid add filter to collection -

Android volley - avoid multiple requests of the same kind to the server? -

Combining PHP Registration and Login into one class with multiple functions in one PHP file -