(* Getting mushroom data from catalogueoflife.org *)

(* Annual-checklist edition used in the URLs below:
   the latest data in old format. *)
yearData = "2019";

(* maintainDataSearch[name]
   Downloads (through urlsave) the annual-checklist search page for `name`
   into <dirdata>/search/<lower-cased name with "_" for spaces>.htm, reads the
   saved page back as text and extracts table rows with two regular
   expressions: one for accepted names, one for synonyms.
   Returns Join[accepted rows, synonym rows]; each row is the list produced
   by the replacement template of the StringCases calls below.
   Returns {} when the saved page cannot be imported as text.
   Side effects: sets the global dirsearch, calls checkdir1 on it, and
   reports the match counts through nPrint. *)
maintainDataSearch[name_String] :=
  Module[{url, filedata, textcondit, text, matchtd, matcha, matchb,
      match1, match2, data1, data2},
    (* no longer working:
       url = "http://www.catalogueoflife.org/col/search/all/items/999/key/" <> StringReplace[name, " " -> "+"] <> "/match/1/fossil/0"; *)
    url = "http://www.catalogueoflife.org/annual-checklist/" <> yearData <>
      "/search/all/key/" <> StringReplace[name, " " -> "+"] <>
      "/fossil/1/match/1";

    dirsearch = ToFileName[{dirdata, "search"}];
    checkdir1[dirsearch];
    filedata = ToFileName[dirsearch,
      ToLowerCase[StringReplace[name, " " -> "_"]] <> ".htm"];

    (* Predicate handed to urlsave: true for texts longer than 1000 characters. *)
    textcondit = ((StringLength[#] > 1000)&);
    urlsave[url, filedata, textcondit];

    text = Import[filedata, "TEXT"];
    (* Import does not return a string when the file is missing or unreadable.
       Treat that as an empty page so that both StringCases calls below yield
       {} instead of being applied to a non-string expression. *)
    If[!StringQ[text],
      Print["Warning: could not import search result file ", filedata];
      text = ""];

    (* NOTE(review): the pattern fragments below contain only whitespace and
       capture groups, and the replacement templates reference $6 and $7
       although five capture groups are visible. It looks as if HTML tag
       literals were lost from these strings; confirm against the page markup.
       The strings are kept exactly as found. *)
    matchtd = "\\s*(.{3,555}?)";

    (* accepted *)
    matcha = "\\s*\\s*\\s*(.{3,222}?)\\s*\\s*";
    match1 = RegularExpression["(?i)(?s)" <> matcha <> matchtd <> matchtd <>
      matchtd <> matchtd <> "\\s*"];
    data1 = StringCases[text, match1 -> {"$2", "$3", "$4", "$5", "$6", "$7"}];
    nPrint[name, ": ", Length[data1], " accepted: ",
      TableForm[Map[removetags, data1, {2}]]];

    (* synonyms *)
    matchb = "\\s*\\s*\\s*(.{3,222}?)\\s*\\s*";
    match2 = RegularExpression["(?i)(?s)" <> matchb <> matchtd <> matchtd <>
      matchtd <> matchtd <> "\\s*"];
    data2 = StringCases[text, match2 -> {"$2", "$3", "$4", "$5", "$6", "$7"}];
    nPrint[name, ": ", Length[data2], " synonyms: ",
      TableForm[Map[removetags, data2, {2}]]];

    (* provisionally accepted *)
    (*If[!StringFreeQ[text, "rovisionally"], Print["Provisionally: ", name]];*)

    Join[data1, data2]];

(* maintainGetData
   For every name in the global list namespic:
     - runs maintainDataSearch[name] and stores the result in iddata[name];
     - if nothing was found, prints a warning;
     - otherwise takes the distinct first elements of the result rows as ids
       and, for each id, downloads (through urlsave) the webservice response
       into <dirdata>/ids/<id>.xml.
   Afterwards the previous namespic is kept in namespic0, and namespic and
   datalist are reduced to the entries whose iddata is non-empty (datalist
   entries are looked up by their first element).
   Side effects: sets the globals dirids, iddata, namespic0, namespic and
   datalist. Evaluates to Null. *)
maintainGetData :=
  Module[{data, ids, url, filedata, textcondit},
    Print["Running maintainGetData ..."];

    (* Predicate handed to urlsave: true for texts longer than 1000 characters.
       It does not depend on the name or id, so it is built once. *)
    textcondit = ((StringLength[#] > 1000)&);

    Do[
      data = maintainDataSearch[name];
      iddata[name] = data;
      If[data === {},
        Print["Warning: NO IDs found during maintainDataSearch[name]! 
name = ", name],
        (* First column of the result rows, de-duplicated and sorted. *)
        ids = Union[data[[All, 1]]];
        (* The target directory is the same for every id of this name, so it
           is resolved and checked once before the download loop. *)
        dirids = ToFileName[{dirdata, "ids"}];
        checkdir1[dirids];
        Do[
          (* No longer works:
             url = "http://www.catalogueoflife.org/col/webservice?id=" <> id <> "&response=full"; *)
          url = "http://www.catalogueoflife.org/annual-checklist/" <>
            yearData <> "/webservice?id=" <> id <> "&response=full";
          filedata = ToFileName[dirids, id <> ".xml"];
          urlsave[url, filedata, textcondit],
          {id, ids}]],
      {name, namespic}];

    (* Keep the unfiltered name list, then drop every name (and its datalist
       entry) for which the search returned nothing. *)
    namespic0 = namespic;
    namespic = Select[namespic0, iddata[#] =!= {}&];
    datalist = Select[datalist, iddata[#[[1]]] =!= {}&];
  ];