java - How to split URL? -
This code splits a URL, but it has a problem: part of the path appears twice in the resulting link, for example www.utem.edu.my/portal/portal, where /portal is duplicated. Any suggestions on how to extract the links in a web page correctly?
public string crawlurl(string strurl) { string results = ""; // return string protocol = "http://"; // assigns input inurl variable , checks add http string inurl = strurl; if (!inurl.tolowercase().contains("http://".tolowercase()) && !inurl.tolowercase().contains("https://".tolowercase())) { inurl = protocol + inurl; } // pulls url contents web string contecturl = pullurl(inurl); if (contecturl == "") { // if fails, try https protocol = "https://"; inurl = protocol + inurl.split("http://")[1]; contecturl = pullurl(inurl); } // declares variables used inside loop string atagattr = ""; string href = ""; string msg = ""; // finds tag , stores href value output var string bodytag = contecturl.split("<body")[1]; // find 1st <body> string[] atags = bodytag.split(">"); // splits on every tag //to show link different 1 int index = 0; (string s: atags) { // process if tag , contains href if (s.tolowercase().contains("<a") && s.tolowercase().contains("href")) { atagattr = s.split("href")[1]; // split on href // split on space if contains if (atagattr.tolowercase().contains("\\s")) atagattr = atagattr.split("\\s")[2]; // splits on link , deals " or ' quotes href = atagattr.split(((atagattr.tolowercase().contains("\""))? "\"" : "\'"))[1]; if (!results.tolowercase().contains(href)) //results += "~~~ " + href + "\r\n"; /* * last touches url before display * adds http(s):// if not exist * adds base url if not exist */ if(results.tolowercase().indexof("about") != -1) { //contains 'about' } if (!href.tolowercase().contains("http://") && !href.tolowercase().contains("https://")) { // http:// + baseurl + href if (!href.tolowercase().contains(inurl.split("://")[1])) href = protocol + inurl.split("://")[1] + href; else href = protocol + href; } system.out.println(href);//debug
Consider using the URL class...
Use it as suggested in the documentation :)
/**
 * Demonstrates how {@code java.net.URL} splits a URL string into its components.
 *
 * <p>Parses one fixed example URL and prints its protocol, authority, host,
 * port, path, query, file and ref to standard output, one component per line.
 *
 * @param args command-line arguments (unused)
 * @throws Exception declared because constructing the URL can throw a checked
 *     exception if the string is not a well-formed URL
 */
public static void main(String[] args) throws Exception {
    // Fully qualified so the snippet does not rely on an import line.
    java.net.URL aUrl = new java.net.URL("http://example.com:80/docs/books/tutorial"
            + "/index.html?name=networking#downloading");

    // Each accessor returns one already-parsed piece of the URL, so no manual
    // String.split() on "://", "/" or "?" is needed.
    System.out.println("protocol = " + aUrl.getProtocol());
    System.out.println("authority = " + aUrl.getAuthority());
    System.out.println("host = " + aUrl.getHost());
    System.out.println("port = " + aUrl.getPort());
    System.out.println("path = " + aUrl.getPath());
    System.out.println("query = " + aUrl.getQuery());
    System.out.println("filename = " + aUrl.getFile());
    System.out.println("ref = " + aUrl.getRef());
}
} // closes the enclosing class, whose header is not shown in this excerpt
the output:
protocol = http
authority = example.com:80
host = example.com
port = 80
etc
After that, you can take the elements you need and create a new string/URL from them :)
Comments
Post a Comment