C++ Parse URL
urlregex.cpp
//sudo apt-get install libboost-all-dev; //g++ urlregex.cpp -lboost_regex #include <string> #include <iostream> #include <boost/regex.hpp> using std::string; using std::cout; using std::endl; using std::stringstream; void parse_url(const string& url) //with boost { boost::regex ex("(http|https)://([^/ :]+):?([^/ ]*)(/?[^ #?]*)\\x3f?([^ #]*)#?([^ ]*)"); boost::cmatch what; if(regex_match(url.c_str(), what, ex)) { string protocol = string(what[1].first, what[1].second); string domain = string(what[2].first, what[2].second); string port = string(what[3].first, what[3].second); string path = string(what[4].first, what[4].second); string query = string(what[5].first, what[5].second); cout << "[" << url << "]" << endl; cout << protocol << endl; cout << domain << endl; cout << port << endl; cout << path << endl; cout << query << endl; cout << "-------------------------------" << endl; } } int main(int argc, char* argv[]) { parse_url("http://www.google.com"); parse_url("https://mail.google.com/mail/"); parse_url("https://www.google.com:443/webhp?gws_rd=ssl"); return 0; }
main.cpp (another example, without boost)
#include <string>
#include <iostream>

using std::string;
using std::cout;
using std::endl;
using std::stringstream;

// Returns a copy of str with leading and trailing spaces, tabs, carriage
// returns and newlines removed. A string that consists only of such
// characters yields an empty string.
string _trim(const string& str)
{
    static const char* const whitespace = " \n\r\t";

    const size_t first = str.find_first_not_of(whitespace);
    if (first == string::npos)
    {
        return string(); // nothing but whitespace (or empty input)
    }
    const size_t last = str.find_last_not_of(whitespace);
    return str.substr(first, last - first + 1);
}

// Splits raw_url into protocol, domain, port, path and query without any
// third-party library and prints each component on its own labelled line.
//
// Only the "http://" and "https://" prefixes are recognised as a protocol;
// for anything else the protocol is printed empty and parsing starts at the
// first character. The fragment (everything from '#') is discarded.
// Components that are absent are printed as empty strings.
void parse_url(const string& raw_url) //no boost
{
    const string url = _trim(raw_url);

    // Length of the "<protocol>://" prefix, 0 when none was recognised.
    size_t offset = 0;
    if (url.compare(0, 8, "https://") == 0)
    {
        offset = 8;
    }
    else if (url.compare(0, 7, "http://") == 0)
    {
        offset = 7;
    }
    const string protocol = offset > 0 ? url.substr(0, offset - 3) : "";

    // The host[:port] part ends at the first '/', '?' or '#'. Stopping only
    // at '/' would swallow a query or fragment that directly follows the
    // host (e.g. "http://host?x=1") into the domain.
    const size_t authority_end = url.find_first_of("/?#", offset);

    string domain;
    string path;
    if (authority_end == string::npos)
    {
        domain = url.substr(offset);
    }
    else
    {
        domain = url.substr(offset, authority_end - offset);
        path   = url.substr(authority_end);
    }

    // Drop the fragment first so a '?' inside it is not taken for a query.
    const size_t hash_pos = path.find('#');
    if (hash_pos != string::npos)
    {
        path.erase(hash_pos);
    }

    string query;
    const size_t qmark_pos = path.find('?');
    if (qmark_pos != string::npos)
    {
        query = path.substr(qmark_pos + 1);
        path.erase(qmark_pos);
    }

    string port;
    const size_t colon_pos = domain.find(':');
    if (colon_pos != string::npos)
    {
        port = domain.substr(colon_pos + 1);
        domain.erase(colon_pos);
    }

    cout << "[" << raw_url << "]" << endl;
    cout << "protocol: " << protocol << endl;
    cout << "domain: " << domain << endl;
    cout << "port: " << port << endl;
    cout << "path: " << path << endl;
    cout << "query: " << query << endl;
}

// Runs parse_url over a few sample URLs.
int main(int argc, char* argv[])
{
    (void)argc; // command-line arguments are not used
    (void)argv;

    parse_url("http://www.google.com");
    parse_url("https://mail.google.com/mail/");
    parse_url("https://www.google.com:443/webhp?gws_rd=ssl");
    return 0;
}
main.cpp (one last example)
#include <string>
#include <stdint.h>
#include <cctype>
#include <cstring>
#include <sstream>
#include <algorithm>
#include <iostream>

using std::cerr;
using std::cout;
using std::endl;
using std::string;

// Parses a URL of the form
//     scheme://host[:port][/path][?query][#fragment]
// once, at construction, and exposes the pieces through read-only public
// reference members (u.protocol, u.domain, u.port, u.path, u.query).
//
// - Recognised schemes are http, https and ftp (matched case-insensitively);
//   for any other input `protocol` is empty and parsing starts at the first
//   character.
// - `domain` is lower-cased.
// - `port` falls back to the scheme default (https 443, ftp 21, otherwise 80)
//   when the URL has none; a port that is not a decimal number in 0..65535
//   yields 0.
// - `path` is "/" when the URL has no path; the fragment is discarded.
//
// Limitations: user-info ("user:pass@host") and bracketed IPv6 hosts are not
// understood; the host is cut at the first ':'.
class HTTPURL
{
    private:
        string   _protocol; // http vs https
        string   _domain;   // mail.google.com
        uint16_t _port;     // 80,443
        string   _path;     // /mail/
        string   _query;    // [after ?] a=b&c=b

    public:
        const string   &protocol;
        const string   &domain;
        const uint16_t &port;
        const string   &path;
        const string   &query;

        HTTPURL(const string& url)
            : _port(0),
              protocol(_protocol), domain(_domain), port(_port), path(_path), query(_query)
        {
            parse(url);
        }

        // The public references must always refer to this object's own
        // fields. The compiler-generated copy constructor would bind them to
        // the source object's fields instead, so copying is spelled out.
        HTTPURL(const HTTPURL& other)
            : _protocol(other._protocol), _domain(other._domain), _port(other._port),
              _path(other._path), _query(other._query),
              protocol(_protocol), domain(_domain), port(_port), path(_path), query(_query)
        {
        }

        // Copies the parsed values; the references stay bound to *this.
        HTTPURL& operator=(const HTTPURL& other)
        {
            if (this != &other)
            {
                _protocol = other._protocol;
                _domain   = other._domain;
                _port     = other._port;
                _path     = other._path;
                _query    = other._query;
            }
            return *this;
        }

    private:
        // Fills the private fields from url. Called once by the constructor.
        void parse(const string& url)
        {
            const string u = trim(url);

            // Detect the scheme prefix; offset is its length ("https://" -> 8).
            static const char* const allowed[] = { "https://", "http://", "ftp://", NULL };
            size_t offset = 0;
            for (int i = 0; allowed[i] != NULL; ++i)
            {
                const size_t len = std::strlen(allowed[i]);
                if (to_lower(u.substr(0, len)) == allowed[i])
                {
                    offset = len;
                    _protocol.assign(allowed[i], len - 3); // without "://"
                    break;
                }
            }

            uint16_t default_port = 80;
            if (_protocol == "https")
            {
                default_port = 443;
            }
            else if (_protocol == "ftp")
            {
                default_port = 21;
            }

            // host[:port] ends at the first '/', '?' or '#'. Stopping only at
            // '/' would pull a query or fragment that directly follows the
            // host into the domain.
            const size_t authority_end = u.find_first_of("/?#", offset);
            string authority;
            string rest; // path, query and fragment
            if (authority_end == string::npos)
            {
                authority = u.substr(offset);
            }
            else
            {
                authority = u.substr(offset, authority_end - offset);
                rest      = u.substr(authority_end);
            }

            // Drop the fragment first so a '?' inside it is not taken for a query.
            const size_t hash_pos = rest.find('#');
            if (hash_pos != string::npos)
            {
                rest.erase(hash_pos);
            }

            const size_t qmark_pos = rest.find('?');
            if (qmark_pos != string::npos)
            {
                _query = rest.substr(qmark_pos + 1);
                rest.erase(qmark_pos);
            }
            _path = rest.empty() ? string("/") : rest;

            const size_t colon_pos = authority.find(':');
            if (colon_pos == string::npos)
            {
                _domain = to_lower(authority);
                _port   = default_port;
            }
            else
            {
                _domain = to_lower(authority.substr(0, colon_pos));
                const string port_text = authority.substr(colon_pos + 1);
                _port = port_text.empty() ? default_port : parse_port(port_text);
            }
        }

        // Returns input without leading/trailing spaces, tabs, CR and LF;
        // input made only of those characters yields an empty string.
        static string trim(const string& input)
        {
            static const char* const whitespace = " \t\n\r";

            const size_t first = input.find_first_not_of(whitespace);
            if (first == string::npos)
            {
                return string();
            }
            const size_t last = input.find_last_not_of(whitespace);
            return input.substr(first, last - first + 1);
        }

        // std::tolower requires a value representable as unsigned char, so
        // the char is converted before the call.
        static char lower_char(char c)
        {
            return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
        }

        // Returns a lower-cased copy of input. (Deliberately not named
        // _tolower: POSIX <ctype.h> may define that name as a macro.)
        static string to_lower(const string& input)
        {
            string out(input);
            std::transform(out.begin(), out.end(), out.begin(), &HTTPURL::lower_char);
            return out;
        }

        // Converts a decimal port number. Returns 0 when text contains a
        // non-digit or the value does not fit in 0..65535.
        static uint16_t parse_port(const string& text)
        {
            unsigned long value = 0;
            for (size_t i = 0; i < text.size(); ++i)
            {
                const char c = text[i];
                if (c < '0' || c > '9')
                {
                    return 0;
                }
                value = value * 10 + static_cast<unsigned long>(c - '0');
                if (value > 65535UL)
                {
                    return 0; // checked every step, so value cannot overflow
                }
            }
            return static_cast<uint16_t>(value);
        }
};

// Parses one sample URL and prints its components.
int main(int argc, char **argv)
{
    (void)argc; // command-line arguments are not used
    (void)argv;

    HTTPURL u("https://Mail.google.com:80/mail/?action=send#action=send");
    cout << "protocol: " << u.protocol << endl;
    cout << "domain: " << u.domain << endl;
    cout << "port: " << u.port << endl;
    cout << "path: " << u.path << endl;
    cout << "query: " << u.query << endl;
    return 0;
}
code snippets are licensed under Creative Commons CC-By-SA 3.0 (unless otherwise specified)
KingVIP
on
2016-01-19 10:38:42
many thanks!
|
Benjamin Sergeant
on
2017-08-30 03:48:47
Thanks a lot for your code Zed, just used it and it works great. I used the regular expression version, which one can use with C++11 now.
|