当前位置:   article > 正文

导出语雀所有文档为markdown文件(不需要超级会员token)_语雀文章爬取

语雀文章爬取

前言:2023-10-23日,语雀挂了一会,导致我没法写文档,这让我意识到文档全部放在云服务器上是不靠谱的,还需要进行本地备份,所以有了本篇文章内容。
重要提示:代码没有经过严谨测试,但我本人用起来没问题,我就不管了,不想浪费太多时间在这个上面,导出的文件会按每个知识库一个目录进行储存

1、首先导入apache httpclient依赖

  1. <!-- apache http -->
  2. <dependency>
  3. <groupId>org.apache.httpcomponents</groupId>
  4. <artifactId>httpclient</artifactId>
  5. <version>4.5.10</version>
  6. </dependency>

2、获取你的authToken和csrfToken

        首先进入你任意一个知识库,按下F12打开开发者工具,然后点击到网络,在过滤框中输入"yuque",然后清空日志,点击任意一篇文章,然后语雀会发送一个请求到语雀的服务器来获取你这篇文章的评论,这个请求中就携带了你的authToken和csrfToken

        然后在右边的标头中一直下滑,找到请求标头,即可看到authToken(即Cookie请求头的完整值)和csrfToken(即X-Csrf-Token请求头的值),分别复制到下面的代码中进行替换即可

  1. import org.apache.http.HttpEntity;
  2. import org.apache.http.HttpResponse;
  3. import org.apache.http.client.methods.HttpGet;
  4. import org.apache.http.client.methods.HttpPost;
  5. import org.apache.http.entity.StringEntity;
  6. import org.apache.http.impl.client.CloseableHttpClient;
  7. import org.apache.http.impl.client.HttpClients;
  8. import org.apache.http.util.EntityUtils;
  9. import org.json.JSONArray;
  10. import org.json.JSONObject;
  11. import java.io.FileOutputStream;
  12. import java.io.IOException;
  13. import java.io.InputStream;
  14. import java.net.URLDecoder;
  15. public class YuqueAPIClient {
  16. static String authToken = "待替换";
  17. static String csrfToken = "待替换";
  18. public static void main(String[] args) {
  19. String baseURL = "https://www.yuque.com/api";
  20. String personalBooksURL = baseURL + "/mine/personal_books";
  21. try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
  22. // Step 1: 获取所有知识库的Id
  23. HttpGet personalBooksRequest = new HttpGet(personalBooksURL);
  24. personalBooksRequest.setHeader("Cookie", authToken);
  25. personalBooksRequest.setHeader("X-Csrf-Token", csrfToken);
  26. HttpResponse personalBooksResponse = httpClient.execute(personalBooksRequest);
  27. HttpEntity personalBooksEntity = personalBooksResponse.getEntity();
  28. String personalBooksResponseString = EntityUtils.toString(personalBooksEntity);
  29. JSONObject personalBooksData = new JSONObject(personalBooksResponseString);
  30. JSONArray personalBooks = personalBooksData.getJSONArray("data");
  31. // 1.1、遍历知识库
  32. for (int i = 0; i < personalBooks.length(); i++) {
  33. JSONObject book = personalBooks.getJSONObject(i);
  34. String bookId = String.valueOf(book.get("id"));
  35. String bookName = (String) book.get("name");
  36. // Step 2: 获取知识库下的所有文章
  37. String bookURL = baseURL + "/docs/?book_id=" + bookId;
  38. HttpGet docRequest = new HttpGet(bookURL);
  39. docRequest.setHeader("Cookie", authToken);
  40. docRequest.setHeader("X-Csrf-Token", csrfToken);
  41. HttpResponse docResponse = httpClient.execute(docRequest);
  42. HttpEntity docEntity = docResponse.getEntity();
  43. String docResponseString = EntityUtils.toString(docEntity);
  44. JSONObject docData = new JSONObject(docResponseString);
  45. JSONArray docs = docData.getJSONArray("data");
  46. // Step 3: 遍历知识库下的有所文章
  47. for (int j = 0; j < docs.length(); j++) {
  48. JSONObject docEntry = docs.getJSONObject(j);
  49. String docId = String.valueOf(docEntry.get("id"));
  50. String exportURL = baseURL + "/docs/" + docId + "/export";
  51. HttpPost exportRequest = new HttpPost(exportURL);
  52. exportRequest.setHeader("Cookie", authToken);
  53. exportRequest.setHeader("X-Csrf-Token", csrfToken);
  54. JSONObject jsonObject = new JSONObject();
  55. jsonObject.put("type", "markdown");
  56. jsonObject.put("force", 0);
  57. // jsonObject.put("options", "{\"latexType\": 2,\"enableAnchor\": 0,\"enableBreak\": 0}");
  58. // latexType:1、导出 LaTeX 公式图片 2、导出 LaTeX 公式为 Markdown 语法
  59. // enableAnchor:导出保持语雀的锚点
  60. // enableBreak:导出保持语雀的换行
  61. jsonObject.put("options", "{\"latexType\":2}");
  62. StringEntity entity = new StringEntity(jsonObject.toString());
  63. entity.setContentType("application/json");
  64. exportRequest.setEntity(entity);
  65. HttpResponse exportResponse = httpClient.execute(exportRequest);
  66. HttpEntity exportEntity = exportResponse.getEntity();
  67. String exportResponseString = EntityUtils.toString(exportEntity);
  68. JSONObject exportData = new JSONObject(exportResponseString).getJSONObject("data");
  69. String downloadURL = (String) exportData.get("url");
  70. try {
  71. // Step 3.1:保存文章到本地
  72. saveFileFromURL(downloadURL, bookName);
  73. }
  74. catch (Exception e) {
  75. e.printStackTrace();
  76. }
  77. }
  78. }
  79. } catch (IOException e) {
  80. e.printStackTrace();
  81. }
  82. }
  83. public static void saveFileFromURL(String fileURL, String bookName) {
  84. try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
  85. HttpGet httpGet = new HttpGet(fileURL);
  86. httpGet.setHeader("Cookie", authToken);
  87. httpGet.setHeader("X-Csrf-Token", csrfToken);
  88. HttpResponse response = httpClient.execute(httpGet);
  89. HttpEntity entity = response.getEntity();
  90. String fileName;
  91. if (response.getHeaders("Content-Disposition")[0].getElements()[0].getParameters().length == 2) {
  92. String file = response.getHeaders("Content-Disposition")[0].getElements()[0].getParameters()[1].getValue();
  93. fileName = URLDecoder.decode(file);
  94. fileName = fileName.substring(fileName.lastIndexOf("'"));
  95. }
  96. else
  97. fileName = response.getHeaders("Content-Disposition")[0].getElements()[0].getParameters()[0].getValue();
  98. if (entity != null) {
  99. try (InputStream inputStream = entity.getContent();
  100. FileOutputStream outputStream = new FileOutputStream("./files/" + bookName + "/" + fileName)) {
  101. byte[] buffer = new byte[1024];
  102. int n;
  103. while ((n = inputStream.read(buffer)) != -1) {
  104. outputStream.write(buffer, 0, n);
  105. }
  106. }
  107. }
  108. } catch (IOException e) {
  109. e.printStackTrace();
  110. }
  111. }
  112. }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/很楠不爱3/article/detail/719578
推荐阅读
  

闽ICP备14008679号