{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":180136168,"defaultBranch":"master","name":"trafilatura","ownerLogin":"adbar","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2019-04-08T11:38:48.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/2125866?v=4","public":true,"private":false,"isOrgOwned":false},"refInfo":{"name":"","listCacheKey":"v0:1718100771.0","currentOid":""},"activityList":{"items":[{"before":"254d42d3ddb4ba6287c7893950eb6decbd76f5b9","after":"307bcceac6a41100372f2fb99f54b3b9f4730bd3","ref":"refs/heads/review_baseline","pushedAt":"2024-06-11T10:40:22.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"simplify","shortMessageHtmlLink":"simplify"}},{"before":null,"after":"254d42d3ddb4ba6287c7893950eb6decbd76f5b9","ref":"refs/heads/review_baseline","pushedAt":"2024-06-11T10:12:51.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"baseline: review extractor sequence, JSON parsing, and cleaning","shortMessageHtmlLink":"baseline: review extractor sequence, JSON parsing, and cleaning"}},{"before":"543f405f39e94aaf1b8586b4e38cecfa91256f8f","after":null,"ref":"refs/heads/convert_to_html","pushedAt":"2024-06-11T10:09:47.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"}},{"before":"f97f67f98016b909830de238b9b5f34997ba4a51","after":"665330172a58ae14fc975855dffb2fe9be9533c4","ref":"refs/heads/master","pushedAt":"2024-06-11T10:09:46.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"extraction: add HTML as output format (#614)\n\n* extractor: add XML to HTML conversion\r\n\r\n* add option to extract and CLI\r\n\r\n* add metadata\r\n\r\n* simplify attribute processing","shortMessageHtmlLink":"extraction: add HTML as output format (#614)"}},{"before":"300f44a141a52b8a07478f7659f441c8f14b09e5","after":"543f405f39e94aaf1b8586b4e38cecfa91256f8f","ref":"refs/heads/convert_to_html","pushedAt":"2024-06-11T10:03:00.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"simplify attribute processing","shortMessageHtmlLink":"simplify attribute processing"}},{"before":"6e0b6c2afba66fbb44e8cf0c508f5b1e55a86240","after":"300f44a141a52b8a07478f7659f441c8f14b09e5","ref":"refs/heads/convert_to_html","pushedAt":"2024-06-10T16:12:53.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"add metadata","shortMessageHtmlLink":"add metadata"}},{"before":"885ad4a46313d60919e1810767d15d140322e3fe","after":"6e0b6c2afba66fbb44e8cf0c508f5b1e55a86240","ref":"refs/heads/convert_to_html","pushedAt":"2024-06-10T15:39:56.000Z","pushType":"push","commitsCount":6,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"fix merge conflict","shortMessageHtmlLink":"fix merge conflict"}},{"before":"e959104ed80a74fef292b295d9e385b000c3a64e","after":null,"ref":"refs/heads/update_docs","pushedAt":"2024-06-07T17:05:13.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"}},{"before":"2e2022af2e8071fd9b444dd5e73b270387342419","after":"f97f67f98016b909830de238b9b5f34997ba4a51","ref":"refs/heads/master","pushedAt":"2024-06-07T17:05:12.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"docs: general overhaul, add page on deduplication (#618)\n\n* docs: update and extend\r\n\r\n* remove installation_gui and improve the rest","shortMessageHtmlLink":"docs: general overhaul, add page on deduplication (#618)"}},{"before":"0e4aececa6125ca0731d873e172d4d16d9a534e6","after":"e959104ed80a74fef292b295d9e385b000c3a64e","ref":"refs/heads/update_docs","pushedAt":"2024-06-07T16:55:38.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"remove installation_gui and improve the rest","shortMessageHtmlLink":"remove installation_gui and improve the rest"}},{"before":"261c9198fb4d2d34536f783c4d5f4bbc1a43afe3","after":null,"ref":"refs/heads/with_metadata","pushedAt":"2024-06-07T14:40:10.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"}},{"before":"29e6bfe9f3d53bbf7381f9c813fcef4e354301c0","after":"2e2022af2e8071fd9b444dd5e73b270387342419","ref":"refs/heads/master","pushedAt":"2024-06-07T14:40:10.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"with_metadata: use argument as switch and add to options (#613)\n\n* use with_metadata argument as switch\r\n\r\n* add markdown meta title test\r\n\r\n* clean up\r\n\r\n* simplify only_with_metadata code","shortMessageHtmlLink":"with_metadata: use argument as switch and add to options (#613)"}},{"before":"1bec38ec394926ca511e85e6f2248b57ae346aac","after":"261c9198fb4d2d34536f783c4d5f4bbc1a43afe3","ref":"refs/heads/with_metadata","pushedAt":"2024-06-07T14:34:46.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"simplify only_with_metadata code","shortMessageHtmlLink":"simplify only_with_metadata code"}},{"before":null,"after":"0e4aececa6125ca0731d873e172d4d16d9a534e6","ref":"refs/heads/update_docs","pushedAt":"2024-06-06T14:43:29.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"docs: update and extend","shortMessageHtmlLink":"docs: update and extend"}},{"before":"04c2fdf585da085fe7712a9b366ee0fe799f4233","after":"29e6bfe9f3d53bbf7381f9c813fcef4e354301c0","ref":"refs/heads/master","pushedAt":"2024-06-06T14:38:00.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"eval: add annotated files (#197)\n\n* set up coinbase annotation\r\n\r\n* Completed Coinbase article annotation.\r\n\r\n* Delete evaldata.p\r\n\r\n* Corrected false apostrophe symbol\r\n\r\n* Added Docker annotation.\r\n\r\n* Added Kickstarter annotation.\r\n\r\n* Added Stack Exchange annotation.\r\n\r\n* Added Stack Exchange annotation.\r\n\r\n* Added geeks3d annotation and beginning of Python docs annotation but did not request pages into directory eval yet.\r\n\r\n* Prepared evaldata.py for 5 upcoming annotations.\r\n\r\n* Prepared 5 annotations by adding filenames to each of them.\r\n\r\n* Added annotations for all five articles.\r\n\r\n* Added 10th annotation.\r\n\r\n* Added first draft of annotation script.\r\n\r\n* fixed minor thing in iterate_over_urls.py\r\n\r\n* prepared scripts for debugging and will revert to this commit in case I mess up the eval-data.py file\r\n\r\n* am cleaning up as I debug, will soon be completely functional\r\n\r\n* minor\r\n\r\n* cleaned up documentation\r\n\r\n* cleaning up documentation\r\n\r\n* documented indexing the boto3 response\r\n\r\n* figured out minor issue with json\r\n\r\n* try to save the PR\r\n\r\n* review annotated data\r\n\r\n* delete unused file\r\n\r\n---------\r\n\r\nCo-authored-by: administrator \r\nCo-authored-by: Adrien Barbaresi \r\nCo-authored-by: Julius Hamilton \r\nCo-authored-by: Adrien Barbaresi ","shortMessageHtmlLink":"eval: add annotated files (#197)"}},{"before":"f8288744491ca696caccc870feed69c5c036c2ce","after":"1bec38ec394926ca511e85e6f2248b57ae346aac","ref":"refs/heads/with_metadata","pushedAt":"2024-06-06T11:43:51.000Z","pushType":"push","commitsCount":4,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"clean up","shortMessageHtmlLink":"clean up"}},{"before":"e6715de396ad6146636b77db068d1a9da705fa00","after":null,"ref":"refs/heads/fix_htmlprocessing_syntax","pushedAt":"2024-06-06T11:09:13.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"}},{"before":"950c348f4ea882eaddc631bcdbf192db831f8750","after":"04c2fdf585da085fe7712a9b366ee0fe799f4233","ref":"refs/heads/master","pushedAt":"2024-06-06T11:09:12.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"HTML cleaning: fix processing syntax and simplify code (#615)","shortMessageHtmlLink":"HTML cleaning: fix processing syntax and simplify code (#615)"}},{"before":null,"after":"e6715de396ad6146636b77db068d1a9da705fa00","ref":"refs/heads/fix_htmlprocessing_syntax","pushedAt":"2024-06-05T18:22:27.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"extraction: fix processing syntax and simplify code","shortMessageHtmlLink":"extraction: fix processing syntax and simplify code"}},{"before":null,"after":"885ad4a46313d60919e1810767d15d140322e3fe","ref":"refs/heads/convert_to_html","pushedAt":"2024-06-05T17:53:06.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"extractor: add XML to HTML conversion","shortMessageHtmlLink":"extractor: add XML to HTML conversion"}},{"before":"b36b6fad68b02cef00d615c5a061e78b52504e6b","after":"950c348f4ea882eaddc631bcdbf192db831f8750","ref":"refs/heads/master","pushedAt":"2024-06-04T12:49:38.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"evaluation: add data, rewrite script, update packages (#606)\n\n* new functions, add output, include small\r\n\r\n* further code clean-up\r\n\r\n* add output files\r\n\r\n* start object-oriented restructuring\r\n\r\n* add pandas\r\n\r\n* add eval data as json file\r\n\r\n* adjust print statements\r\n\r\n* fix outputs\r\n\r\n* Update eval-requirements.txt\r\n\r\n* add results dir\r\n\r\n* round in output files\r\n\r\n* add new evaluation files\r\n\r\n* adjust new files\r\n\r\n* add further files\r\n\r\n* re-run evaluation with trafilatura 1.9.0 and new data\r\n\r\n* finalize changes\r\n\r\n* review structure and setup\r\n\r\n* adapt evaluation\r\n\r\n* simplify code, test and improve usability\r\n\r\n* replace empty file\r\n\r\n* rm double entry\r\n\r\n* rm further duplicates\r\n\r\n* add html2txt and update docs\r\n\r\n* regroup, use binary as input, test\r\n\r\n* fixes\r\n\r\n---------\r\n\r\nCo-authored-by: Adrien Barbaresi ","shortMessageHtmlLink":"evaluation: add data, rewrite script, update packages (#606)"}},{"before":"e437097633c8005f1d895ac1063fff891cb95e33","after":"f8288744491ca696caccc870feed69c5c036c2ce","ref":"refs/heads/with_metadata","pushedAt":"2024-06-03T16:44:45.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"add markdown meta title test","shortMessageHtmlLink":"add markdown meta title test"}},{"before":null,"after":"e437097633c8005f1d895ac1063fff891cb95e33","ref":"refs/heads/with_metadata","pushedAt":"2024-06-03T15:54:01.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"use with_metadata argument as switch","shortMessageHtmlLink":"use with_metadata argument as switch"}},{"before":"847e6c1d41205b908b1edb44d775d1a8f0e804ca","after":null,"ref":"refs/heads/dependabot/pip/dependencies-98f4b8e004","pushedAt":"2024-06-03T10:08:34.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"}},{"before":null,"after":"847e6c1d41205b908b1edb44d775d1a8f0e804ca","ref":"refs/heads/dependabot/pip/dependencies-98f4b8e004","pushedAt":"2024-06-01T16:34:48.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"dependabot[bot]","name":null,"path":"/apps/dependabot","primaryAvatarUrl":"https://avatars.githubusercontent.com/in/29110?s=80&v=4"},"commit":{"message":"build(deps): bump the dependencies group with 5 updates\n\nBumps the dependencies group with 5 updates:\n\n| Package | From | To |\n| --- | --- | --- |\n| [trafilatura](https://github.com/adbar/trafilatura) | `1.8.1` | `1.10.0` |\n| [html2text](https://github.com/Alir3z4/html2text) | `2020.1.16` | `2024.2.26` |\n| [html-text](https://github.com/zytedata/html-text) | `0.5.2` | `0.6.2` |\n| [justext](https://github.com/miso-belica/jusText) | `3.0.0` | `3.0.1` |\n| [resiliparse](https://github.com/chatnoir-eu/chatnoir-resiliparse) | `0.14.5` | `0.14.7` |\n\n\nUpdates `trafilatura` from 1.8.1 to 1.10.0\n- [Release notes](https://github.com/adbar/trafilatura/releases)\n- [Changelog](https://github.com/adbar/trafilatura/blob/master/HISTORY.md)\n- [Commits](https://github.com/adbar/trafilatura/compare/v1.8.1...v1.10.0)\n\nUpdates `html2text` from 2020.1.16 to 2024.2.26\n- [Release notes](https://github.com/Alir3z4/html2text/releases)\n- [Changelog](https://github.com/Alir3z4/html2text/blob/master/ChangeLog.rst)\n- [Commits](https://github.com/Alir3z4/html2text/compare/2020.1.16...2024.2.26)\n\nUpdates `html-text` from 0.5.2 to 0.6.2\n- [Changelog](https://github.com/zytedata/html-text/blob/master/CHANGES.rst)\n- [Commits](https://github.com/zytedata/html-text/compare/0.5.2...0.6.2)\n\nUpdates `justext` from 3.0.0 to 3.0.1\n- [Release notes](https://github.com/miso-belica/jusText/releases)\n- [Changelog](https://github.com/miso-belica/jusText/blob/main/CHANGELOG.rst)\n- [Commits](https://github.com/miso-belica/jusText/compare/v3.0.0...v3.0.1)\n\nUpdates `resiliparse` from 0.14.5 to 0.14.7\n- [Commits](https://github.com/chatnoir-eu/chatnoir-resiliparse/compare/v0.14.5...v0.14.7)\n\n---\nupdated-dependencies:\n- dependency-name: trafilatura\n dependency-type: direct:production\n update-type: version-update:semver-minor\n dependency-group: dependencies\n- dependency-name: html2text\n dependency-type: direct:production\n update-type: version-update:semver-major\n dependency-group: dependencies\n- dependency-name: html-text\n dependency-type: direct:production\n update-type: version-update:semver-minor\n dependency-group: dependencies\n- dependency-name: justext\n dependency-type: direct:production\n update-type: version-update:semver-patch\n dependency-group: dependencies\n- dependency-name: resiliparse\n dependency-type: direct:production\n update-type: version-update:semver-patch\n dependency-group: dependencies\n...\n\nSigned-off-by: dependabot[bot] ","shortMessageHtmlLink":"build(deps): bump the dependencies group with 5 updates"}},{"before":"7e245aa323ebabd4d72ea7f37b3dc2a007e57650","after":null,"ref":"refs/heads/prepare_v1.10","pushedAt":"2024-05-30T15:45:29.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"}},{"before":"bbf7bec12f2d0491c9d0dfc974dab822f4a8a65c","after":"b36b6fad68b02cef00d615c5a061e78b52504e6b","ref":"refs/heads/master","pushedAt":"2024-05-30T15:45:28.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"prepare version 1.10.0 (#608)\n\n* prepare version 1.10.0\r\n\r\n* fixes","shortMessageHtmlLink":"prepare version 1.10.0 (#608)"}},{"before":"3099393e8d1c79cf6b1503220a5b5a77e050f0c4","after":"7e245aa323ebabd4d72ea7f37b3dc2a007e57650","ref":"refs/heads/prepare_v1.10","pushedAt":"2024-05-30T15:34:38.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"fixes","shortMessageHtmlLink":"fixes"}},{"before":null,"after":"3099393e8d1c79cf6b1503220a5b5a77e050f0c4","ref":"refs/heads/prepare_v1.10","pushedAt":"2024-05-30T15:29:15.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"prepare version 1.10.0","shortMessageHtmlLink":"prepare version 1.10.0"}},{"before":"9569dad27dab2ebedac74856f339b60e95124ff5","after":"bbf7bec12f2d0491c9d0dfc974dab822f4a8a65c","ref":"refs/heads/master","pushedAt":"2024-05-30T12:51:02.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"adbar","name":"Adrien Barbaresi","path":"/adbar","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2125866?s=80&v=4"},"commit":{"message":"Markdown fixes: table formatting (#601)\n\n* fix: do not add new lines in markdown cells\r\n\r\n* fix: markdown tables can only have one header\r\n\r\n* fix: add a space after text before a paragraph in a cell\r\n\r\nParagraph is a block level element so we would normally have a new line before\r\nit. However, here we are in a markdown cell and can't have new lines, so add a\r\nspace. Otherwise words will get concatenated.\r\n\r\n* fix: match maximum cell count on each row\r\n\r\nMarkdown does not support colspan, but at least this way we don't lose any cell\r\ndata.\r\n\r\n* fix: cells always need to append vertical bars\r\n\r\nCurrently there was a scenario where if a cell only contains a single

with\r\nsome text but no text directly, then vertical bars would not get appended for\r\nthat cells.\r\n\r\n* Fix table processing tests and add a few more\r\n\r\nAdded the following table tests in text format:\r\n- removing new lines in cells,\r\n- only allowing a single header row,\r\n- handling colspan by appending columns.\r\n\r\n* fix: remove row span attribute once it is no longer useful","shortMessageHtmlLink":"Markdown fixes: table formatting (#601)"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAEYhZpaAA","startCursor":null,"endCursor":null}},"title":"Activity ยท adbar/trafilatura"}