java - How to split the URL the get all the link in webpage? -
i'm doing project hyperlink crawler
inspecting broken link. code. www.utem.edu.my/portal/portal. link give 404 error. think code split url wrong. me please.
public class hinterface extends jframe { // declaring variables used components in interface private jlabel lblurl; private jtextfield inputsearch; private jbutton btnsearch; private jeditorpane outputlinks; public hinterface() { super("hyperlink crawler"); settype(type.popup); setresizable(false); getcontentpane().setbackground(color.black); settitle("web link crawler inspecting broken link"); flowlayout flowlayout = new flowlayout(); flowlayout.setalignment(flowlayout.left); getcontentpane().setlayout(flowlayout); // creates label displaying text lblurl = new jlabel("\r\nenter url : "); lblurl.setlocation(new point(13, 9)); lblurl.setdoublebuffered(true); lblurl.setalignmenty(component.bottom_alignment); lblurl.setalignmentx(component.right_alignment); lblurl.setverticalalignment(swingconstants.top); lblurl.setforeground(color.white); lblurl.setfont(new font("tw cen mt condensed", font.bold, 20)); getcontentpane().add(lblurl); // creates text field url input inputsearch = new jtextfield(); inputsearch.settext("http://"); inputsearch.setpreferredsize(new dimension(400, 32)); inputsearch.setfont(new font("sansserif", font.bold, 17)); getcontentpane().add(inputsearch); // creates search button btnsearch = new jbutton(" search "); btnsearch.setpreferredsize(new dimension(100, 32)); btnsearch.setfont(new font("sansserif", font.bold, 13)); getcontentpane().add(btnsearch); // adds results text area scroll-able pane jscrollpane scrolloutput = new jscrollpane (jscrollpane.vertical_scrollbar_as_needed, jscrollpane.horizontal_scrollbar_never); scrolloutput.setbackground(color.gray); scrolloutput.setpreferredsize(new dimension(1100, 670)); getcontentpane().add(scrolloutput); outputlinks = new jeditorpane(); outputlinks.addhyperlinklistener(new hyperlinklistener() { public void hyperlinkupdate(hyperlinkevent e) { if (hyperlinkevent.eventtype.activated.equals(e.geteventtype())) { system.out.println(e.geturl()); desktop desktop = desktop.getdesktop(); try { desktop.browse(e.geturl().touri()); } catch (exception ex) { ex.printstacktrace(); } } } }); outputlinks.settext("result"); outputlinks.setcontenttype("text/html"); outputlinks.seteditable(true); scrolloutput.setcolumnheaderview(outputlinks); outputlinks.seteditable(false); // add event handler search button click handleevents theeventhandler = new handleevents(); inputsearch.addactionlistener(theeventhandler); btnsearch.addactionlistener(theeventhandler); } private class handleevents implements actionlistener { public void actionperformed(actionevent event) { // called when elements triggered // preparing output variable string string stroutput = "no results found!"; if (event.getsource() == btnsearch || event.getsource() == inputsearch) { if (!inputsearch.gettext().equals("")) stroutput = crawlurl(inputsearch.gettext()); else stroutput = "please enter url crawl it's hyperlinks"; } // prints out results outputlinks.settext(stroutput); } public string pullurl(string strurl) { string resutls = ""; urlconnection connection = null; try { connection = new url(strurl).openconnection(); @suppresswarnings("resource") scanner scanner = new scanner(connection.getinputstream()); scanner.usedelimiter("\\z"); if(scanner.hasnext()) resutls = scanner.next(); } catch ( exception ex ) { ex.printstacktrace(); } return resutls; } public string crawlurl(string strurl) { string results = ""; // return string protocol = "http://"; // assigns input inurl variable , checks add http string inurl = strurl; if (!inurl.tolowercase().contains("http://".tolowercase()) && !inurl.tolowercase().contains("https://".tolowercase())) { inurl = protocol + inurl; } // pulls url contents web string contecturl = pullurl(inurl); if (contecturl == "") { // if fails, try https protocol = "https://"; inurl = protocol + inurl.split("http://")[1]; contecturl = pullurl(inurl); } // declares variables used inside loop string atagattr = ""; string href = ""; string msg = ""; // finds tag , stores href value output var string bodytag = contecturl.split("<body")[1]; // find 1st <body> string[] atags = bodytag.split(">"); // splits on every tag //to show link different 1 int index = 0; (string s: atags) { // process if tag , contains href if (s.tolowercase().contains("<a") && s.tolowercase().contains("href")) { atagattr = s.split("href")[1]; // split on href // split on space if contains if (atagattr.tolowercase().contains("\\s")) atagattr = atagattr.split("\\s")[2]; // splits on link , deals " or ' quotes href = atagattr.split( ((atagattr.tolowercase().contains("\""))? "\"" : "\'") )[1]; if (!results.tolowercase().contains(href)) //results += "~~~ " + href + "\r\n"; /* * last touches url before display * adds http(s):// if not exist * adds base url if not exist */ if(results.tolowercase().indexof("about") != -1) { //contains 'about' } if (!href.tolowercase().contains("http://") && !href.tolowercase().contains("https://")) { // http:// + baseurl + href if (!href.tolowercase().contains(inurl.split("://")[1])) href = protocol + inurl.split("://")[1] + href; else href = protocol + href; } system.out.println(href);//debug try { msg = urlheker(href); } catch (exception e) { // todo auto-generated catch block e.printstacktrace(); } // store link in output var if (!results.tolowercase().contains(href)){ results += "<a href=\""; results += href; results += "\">"; results += "link" + (index + 1)+ " : "+ href ; results += "</a>"; results += " : "; results += msg; results += "<br>"; index++; } } } system.out.println(results); return results; } } public string urlheker(string href) throws exception { string msg = ""; int code = 0; url url = new url(href); urlconnection connection = url.openconnection(); if(connection instanceof httpurlconnection) { httpurlconnection httpconn=(httpurlconnection)connection; code = httpconn.getresponsecode(); msg = httpconn.getresponsemessage(); if(code == httpurlconnection.http_ok ) system.out.println("return normal response :"+msg); else system.out.println(code); } msg = msg+" [" + integer.tostring(code) + "]"; return msg; } }
i'm not sure if resolves problem, can check response code before getting input stream connection:
public string pullurl(string strurl) { string resutls = ""; httpurlconnection connection = null; try { connection = (httpurlconnection)new url(strurl).openconnection(); //connection.getresponsecode() <- check response code @suppresswarnings("resource") scanner scanner = new scanner(connection.getinputstream()); scanner.usedelimiter("\\z"); if (scanner.hasnext()) resutls = scanner.next(); } catch (exception ex) { ex.printstacktrace(); } return resutls; }
Comments
Post a Comment