(* Getting mushroom data from catalogueoflife.org *)
(* yearData is the annual-checklist edition; it is spliced into the search and webservice URLs built in this file. *)
yearData = "2019"; (* the latest data in old format *)
(* maintainDataSearch[name]
   Looks up a name in the Catalogue of Life annual checklist (edition given by yearData),
   stores the search result page on disk and extracts table rows from it with two regular expressions.
   Parameter: name - a string; spaces become "+" in the URL and "_" in the cache file name.
   Returns: Join[data1, data2], i.e. the "accepted" matches followed by the "synonyms" matches;
            each match is a list of six captured strings.
   Side effects: assigns the symbol dirsearch, which is not in the Module's local list, calls checkdir1 and urlsave,
                 and reports the match counts through nPrint. *)
maintainDataSearch[name_String] := Module[{url, filedata, textcondit, text, matchtd, matcha, matchb, match1, match2, data1, data2},
(* Former search endpoint, kept for reference; no longer working:
url = "http://www.catalogueoflife.org/col/search/all/items/999/key/" <>
StringReplace[name, " " -> "+"] <> "/match/1/fossil/0";
*)
(* Search URL of the annual checklist for the edition in yearData. *)
url = "http://www.catalogueoflife.org/annual-checklist/" <> yearData <> "/search/all/key/" <>
StringReplace[name, " " -> "+"] <> "/fossil/1/match/1";
(* dirsearch is the "search" subdirectory of dirdata; dirdata is defined outside this block. *)
dirsearch = ToFileName[{dirdata, "search"}];
(* presumably makes sure the directory exists - checkdir1 is defined elsewhere, TODO confirm *)
checkdir1[dirsearch];
(* Cache file: lower-cased name with spaces replaced by underscores, extension .htm. *)
filedata = ToFileName[dirsearch, ToLowerCase[StringReplace[name, " " -> "_"]] <> ".htm"];
(* Predicate that is True for strings longer than 1000 characters; handed to urlsave
   (presumably as a sanity check on the downloaded text - verify against urlsave). *)
textcondit = ((StringLength[#] > 1000)&);
urlsave[url, filedata, textcondit];
(* Read the stored page back as plain text. *)
text = Import[filedata, "TEXT"];
(* NOTE(review): the pattern strings below look as if their HTML tag literals were lost:
   matcha and matchb are textually identical, and the replacement lists reference capture groups 2 to 7
   while only five parenthesised groups are visible. Compare with the upstream source before relying on them. *)
(* Pattern fragment for one table cell; the string literal spans two lines and therefore contains a newline. *)
matchtd = "\\s*
(.{3,555}?)";
(* accepted *)
matcha = "\\s*\\s*\\s*(.{3,222}?)\\s*\\s*";
(* (?i) = case-insensitive, (?s) = dot also matches newlines; one matcha fragment followed by four matchtd fragments. *)
match1 = RegularExpression["(?i)(?s)" <> matcha <> matchtd <> matchtd <> matchtd <> matchtd <> "\\s*"];
data1 = StringCases[text, match1 -> {"$2", "$3", "$4", "$5", "$6", "$7"}];
(* removetags is applied to every captured string (level 2) for display only; data1 itself is left unchanged. *)
nPrint[name, ": ", Length[data1], " accepted: ", TableForm[Map[removetags, data1, {2}]]];
(* synonyms *)
matchb = "\\s*\\s*\\s*(.{3,222}?)\\s*\\s*";
match2 = RegularExpression["(?i)(?s)" <> matchb <> matchtd <> matchtd <> matchtd <> matchtd <> "\\s*"];
data2 = StringCases[text, match2 -> {"$2", "$3", "$4", "$5", "$6", "$7"}];
nPrint[name, ": ", Length[data2], " synonyms: ", TableForm[Map[removetags, data2, {2}]]];
(* ++++++++++++++++++ *)
(* provisionally accepted: disabled check, kept for reference *)
(*If[!StringFreeQ[text, "rovisionally"], Print["Provisionally: ", name]];*)
(* ++++++++++++++++++++ *)
(* Result: accepted matches first, then synonym matches. *)
Join[data1, data2]];
(* maintainGetData
   For every name in namespic: runs maintainDataSearch, stores the result in iddata[name] and,
   when matches were found, requests the full webservice record for each distinct id
   (the first element of every match) into the "ids" subdirectory of dirdata as <id>.xml.
   Afterwards the previous namespic is kept in namespic0, and namespic and datalist are
   reduced to the entries whose iddata is non-empty.
   Assigns the non-local symbols dirids, iddata, namespic0, namespic and datalist; evaluates to Null. *)
maintainGetData := Module[{found, idList, serviceUrl, xmlFile, sizeOk},
  Print["Running maintainGetData ..."];
  Do[
    found = maintainDataSearch[name];
    iddata[name] = found;
    If[found =!= {},
      (* Distinct ids, sorted by Union. *)
      idList = Union[Transpose[found][[1]]];
      Do[
        dirids = ToFileName[{dirdata, "ids"}];
        checkdir1[dirids];
        (* The former endpoint http://www.catalogueoflife.org/col/webservice?id=...&response=full no longer works. *)
        serviceUrl = StringJoin[
          "http://www.catalogueoflife.org/annual-checklist/", yearData,
          "/webservice?id=", id, "&response=full"];
        xmlFile = ToFileName[dirids, StringJoin[id, ".xml"]];
        (* Predicate handed to urlsave: True for text longer than 1000 characters. *)
        sizeOk = Function[StringLength[#] > 1000];
        urlsave[serviceUrl, xmlFile, sizeOk],
        {id, idList}],
      Print["Warning: NO IDs found during maintainDataSearch[name]! name = ", name]],
    {name, namespic}];
  (* Keep the unfiltered list, then drop every entry without search results. *)
  namespic0 = namespic;
  namespic = Select[namespic0, iddata[#] =!= {} &];
  datalist = Select[datalist, iddata[#[[1]]] =!= {} &];
];