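// TODO: resource cleanup - close the per-request responses (response_a, response_b, response_c)
// as well as the shared HttpClient, and decide where each close() call should be placed.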
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
public class JsoupPart {
// This main method fetches the drug data; it only needs to be run once to produce the output file.
public static void main(String[] args) throws Exception{
long startTime = System.currentTimeMillis();
// HttpClient 请求获取 https://drugs.dxy.cn/ 首页 html
CloseableHttpClient client = HttpClients.createDefault();
HttpGet get = new HttpGet("https://drugs.dxy.cn/");
CloseableHttpResponse response = client.execute(get);
String content = "";
if(response.getStatusLine().getStatusCode() == 200){
content = EntityUtils.toString(response.getEntity(), "UTF-8");
}
//关闭链接
//client.close();
// 将前面获取到的 html 文本转换为 Document 对象
Document doc = Jsoup.parse(content);
// 首页左边分类名称
String FirstLevelName[] = new String[50];
// 首页左边分类id
String FirstLevelId[] = new String[50];
// 获取 json 对象(为了得到全部下一级链接)
String tmpjson = doc.select("script[type=application/json]").toString();
int tmpj1 = tmpjson.lastIndexOf("<");
int tmpj2 = tmpjson.indexOf(">");
tmpjson = tmpjson.substring(tmpj2+1, tmpj1);
JSONObject json = JSON.parseObject(tmpjson);
// 分层获取信息,直到得到 Category List
JSONObject props = json.getJSONObject("props");
JSONObject pageProps = props.getJSONObject("pageProps");
// The list on the left side of the home page (i.e. the drug category information)
JSONArray firstLevelCategoryList = pageProps.getJSONArray("firstLevelCategoryList");
// 遍历 firstLevelCategoryList 将左边 List 的 name 和 id 存在数组中
int FirstLevelSize = firstLevelCategoryList.size();
for(int i=0;i<FirstLevelSize;i++) {
FirstLevelName[i] = (firstLevelCategoryList.getJSONObject(i)).get("name").toString();
FirstLevelId[i] = (firstLevelCategoryList.getJSONObject(i)).get("id").toString();
}
// The list on the right side of the home page (the level below the left-hand list)
JSONArray secondLevelCategoryList = pageProps.getJSONArray("secondLevelCategoryList");
int SecondLevelSize = secondLevelCategoryList.size();
// 遍历访问首页右边的所有 List
for(int i=0;i<SecondLevelSize;i++) {
// FirstName 为药品类别名, 即首页左边栏中的药品一层分类
String FirstName = "";
// SecondName 为首页右边栏中的药品二级分类, 用处不大所以未做处理
//String SecondName = secondLevelCategoryList.getJSONObject(i).getString("name").toString();
// FirstId 为首页左边栏中的药品分类id
String FirstId = secondLevelCategoryList.getJSONObject(i).getString("supId").toString();
// SecondId 为首页右边栏中的药品二层分类id, 用于获取右边链接的 url, 即 SecondUrl
String SecondId = secondLevelCategoryList.getJSONObject(i).getString("id").toString();
String SecondUrl = "https://drugs.dxy.cn/category/" + SecondId;
// 遍历药品分类的 Id, 用于确定右边的药品二级分类属于哪一个一级分类
for(int j=0;j<FirstLevelSize;j++) {
if(FirstLevelId[j].equals(FirstId)) {
FirstName = FirstLevelName[j];
break;
}
}
// HttpClient 请求获取网页 html
HttpGet get_c = new HttpGet(SecondUrl);
CloseableHttpResponse response_c = client.execute(get_c);
content = "";
if(response_c.getStatusLine().getStatusCode()==200){
content = EntityUtils.toString(response_c.getEntity(), "UTF-8");
}
// 将前面获取到的 html 文本转换为Document对象
doc = Jsoup.parse(content);
// 获取当前页面的 PageSize
String tmpstring = doc.select("script[id=__NEXT_DATA__]").toString();
int tmp1 = tmpstring.indexOf("pageSize");
int tmp2 = tmpstring.indexOf("total");
int PageSize = Integer.valueOf(tmpstring.substring(tmp1+10, tmp2-2));
// 遍历当前页面的所有有效 Page
for(int j = 0;j < PageSize;j++) {
// HttpClient 请求获取网页 html
HttpGet get_b = new HttpGet(SecondUrl + "?page=" + j);
CloseableHttpResponse response_b = client.execute(get_b);
content = "";
if(response_b.getStatusLine().getStatusCode()==200){
content = EntityUtils.toString(response_b.getEntity(), "UTF-8");
}
// 将前面获取到的 html 文本转换为Document对象
doc = Jsoup.parse(content);
// 选择以 /drug 开头的链接的 Elements
Elements DrugsElements = doc.select("a[href^=/drug]");
//System.out.println(DrugsElements);
// 遍历访问以上获得的 Elements
int NumberElements = DrugsElements.size();
for(int k = 0;k < NumberElements;k++) {
// 获取具体药品链接地址
String ElementTmp = DrugsElements.get(k).toString();
int elementtmp1 = ElementTmp.indexOf("\"");
int elementtmp2 = ElementTmp.lastIndexOf("\"");
ElementTmp = ElementTmp.substring(elementtmp1+1, elementtmp2);
ElementTmp = "https://drugs.dxy.cn" + ElementTmp;
System.out.println(ElementTmp);
// HttpClient 请求获取网页 html
HttpGet get_a = new HttpGet(ElementTmp);
CloseableHttpResponse response_a = client.execute(get_a);
content = "";
if(response_a.getStatusLine().getStatusCode() == 200){
content = EntityUtils.toString(response_a.getEntity(), "UTF-8");
}
// 将前面获取到的 html 文本转换为Document对象
doc = Jsoup.parse(content);
// 获取文本
String OneOfDrugs = doc.text();
//System.out.println(doc);
//System.out.println("------------------------------------------------------------");
// 获得药品的必要属性
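// The number in the trailing comment on each line is the search marker's length plus one,
// i.e. the offset added to the marker's index to reach the start of the value
// (the extra character is presumably a separator following the label - TODO confirm).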
int MedicineNameIndex1 = OneOfDrugs.indexOf("通用名称"); // + 5
int MedicineNameIndex2 = OneOfDrugs.indexOf("英文名称"); // + 5
int MedicineNameIndex3 = OneOfDrugs.indexOf("商品名称"); // + 5
int MedicineComponentIndex = OneOfDrugs.indexOf("【成份】"); // + 5
int MedicineIndicationIndex = OneOfDrugs.indexOf("【适应症】"); // + 6
int MedicineUsageIndex = OneOfDrugs.indexOf("【用法用量】"); // + 7
int MedicinePrecautionsIndex = OneOfDrugs.indexOf("【注意事项】"); // + 7
int MedicineContraindicationsIndex = OneOfDrugs.indexOf("【禁忌】"); // + 5
int MedicineGravidaIndex = OneOfDrugs.indexOf("【孕妇及哺乳期妇女用药】"); // + 13
int MedicinePharamacologicalActionIndex = OneOfDrugs.indexOf("【药理作用】"); // + 7
int MedicinePharmacokineticsIndex = OneOfDrugs.indexOf("【药代动力学】"); // + 8
int MedicineChemicalCompositionIndex = OneOfDrugs.indexOf("【化学成份】"); // + 7
int MedicineOTCIndex = OneOfDrugs.indexOf("【是否OTC】"); // + 8
// 获取各个属性的内容
// 获取通用名称
String MedicineNameString1 = "";
if(MedicineNameIndex1 >= 0) {
MedicineNameString1 = OneOfDrugs.substring(MedicineNameIndex1 + 5);
if(MedicineNameIndex2 >= 0) {
MedicineNameString1 = MedicineNameString1.substring(0,MedicineNameIndex2 - MedicineNameIndex1 - 5);
}else if(MedicineNameIndex3 >= 0) {
MedicineN