# Code to download CAIE past papers (past exam papers) so students can get all the
# past papers they need to revise for their exams.

import os

import requests
from bs4 import BeautifulSoup  # the "lxml" parser also needs to be installed (pip install lxml)
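

# The scraper below makes many sequential requests.get calls with no timeout. A minimal
# helper like this sketch (not part of the original script; the name fetch_html is
# illustrative) could be swapped in for those calls to add a timeout and to fail loudly
# on a bad status code instead of silently saving an error page.
def fetch_html(url: str, timeout: float = 30.0) -> str:
    """Return the HTML body of url, raising requests.HTTPError on a non-2xx response."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text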


class DownloadPapers:
    def __init__(self):
        # index pages for each qualification on pastpapers.co
        self.websites = [
            "https://pastpapers.co/cie/?dir=A-Level",
            "https://pastpapers.co/cie/?dir=IGCSE",
            "https://pastpapers.co/cie/?dir=O-Level",
            "https://pastpapers.co/cie/?dir=Pre-U",
        ]
        # exam-session years (2000-2024) that appear as directory names on the site
        self.years = [str(year) for year in range(2000, 2025)]

    # Download every A-Level, IGCSE, O-Level and Pre-U past paper (past exam paper)
    # into folders laid out as <qualification>/<subject>/<session>.
    def download_all(self) -> None:
        for website in self.websites:
            topics_soup = BeautifulSoup(requests.get(url=website).text, "lxml")
            qualification = website.split("=")[-1].replace("%20", " ").replace("%26", "&")
            os.makedirs(qualification, exist_ok=True)

            # links to every subject offered for this qualification
            topics = [f'https://pastpapers.co/cie/{topic["href"]}'
                      for topic in topics_soup.find_all(name="a", class_="clearfix")
                      if topic["href"] != 'https://pastpapers.co/cie/']

            for topic in topics:
                years_soup = BeautifulSoup(requests.get(url=topic).text, "lxml")
                subject_name = topic.split("/")[-1].replace("%20", " ").replace("%26", "&")
                os.makedirs(f"{qualification}/{subject_name}", exist_ok=True)

                # links to every session folder (a year or an extra-resources folder) for this subject
                session_links = [f'https://pastpapers.co/cie/{link["href"]}'
                                 for link in years_soup.find_all(name="a", class_="clearfix")
                                 if link["href"] not in self.websites]
                print(session_links)

                for session_link in session_links:
                    final_soup = BeautifulSoup(requests.get(url=session_link).text, "lxml")
                    # every "clearfix" link on the page except the last one
                    papers = [f'https://pastpapers.co/cie/{a["href"]}'
                              for a in final_soup.find_all("a", class_="clearfix")][:-1]
                    session = session_link.split("/")[-1].replace("%20", " ").replace("%26", "&")

                    if session in self.years:
                        # inside a year folder, follow the non-PDF links (sub-folders) and
                        # download the papers listed in each one
                        subfolders = [paper.replace("%20", " ") for paper in papers if "pdf" not in paper]

                        for subfolder in subfolders:
                            final_soup = BeautifulSoup(requests.get(url=subfolder).text, "lxml")
                            papers = [f'https://pastpapers.co/cie/{a["href"]}'
                                      for a in final_soup.find_all("a", class_="clearfix")][:-1]
                            print(papers)
                            session = subfolder.split("/")[-1].replace("%20", " ")
                            os.makedirs(f"{qualification}/{subject_name}/{session}", exist_ok=True)

                            for paper in papers:
                                paper_name = paper.split("/")[-1].replace("%20", " ").replace("%26", "&")
                                with open(f"{qualification}/{subject_name}/{session}/{paper_name}", "wb") as f:
                                    f.write(requests.get(url=paper).content)
                                print(f"{qualification}/{subject_name}/{session}/{paper_name} was downloaded")
                    else:
                        # any other folder holds the PDFs directly, so download them as they are
                        os.makedirs(f"{qualification}/{subject_name}/{session}", exist_ok=True)

                        for paper in papers:
                            print(paper)
                            paper_name = paper.split("/")[-1]
                            with open(f"{qualification}/{subject_name}/{session}/{paper_name}", "wb") as f:
                                f.write(requests.get(url=paper).content)
                            print(f"{qualification}/{subject_name}/{session}/{paper_name} was downloaded")

    # Print each qualification followed by the links to all of its subjects (topics).
    def show_topics(self) -> None:
        for website in self.websites:
            print(website.split("=")[-1])
            topics_soup = BeautifulSoup(requests.get(url=website).text, "lxml")
            topics = [f'https://pastpapers.co/cie/{topic["href"]}'
                      for topic in topics_soup.find_all(name="a", class_="clearfix")
                      if topic["href"] != 'https://pastpapers.co/cie/']
            print(topics)

    # Given a qualification and a subject name, download all of that subject's past papers.
    # The subject name must be the URL-encoded directory name taken from the end of the
    # pastpapers.co URL. For example, for
    # https://pastpapers.co/cie/?dir=IGCSE/Information%20and%20Communication%20Technology%20%289-1%29%20%280983%29
    # the qualification "igcse" is good enough, but the subject name has to be exactly
    # Information%20and%20Communication%20Technology%20%289-1%29%20%280983%29
    # (a helper sketch for producing that encoded form follows this method).
    # Example:
    # dp = DownloadPapers()
    # dp.download_subject("igcse", "Information%20and%20Communication%20Technology%20%289-1%29%20%280983%29")
    def download_subject(self, qualification: str, subject_name: str) -> None:
        # normalise the qualification to the capitalisation used in the site's URLs
        if qualification.lower() == "igcse":
            qualification = qualification.upper()
        else:
            qualification = qualification.title()

        website = f"https://pastpapers.co/cie/?dir={qualification}"
        link = f"https://pastpapers.co/cie/?dir={qualification}/{subject_name}"
        soup = BeautifulSoup(requests.get(url=link).text, "lxml")
        session_links = [f'https://pastpapers.co/cie/{a["href"]}'
                         for a in soup.find_all(name="a", class_="clearfix")
                         if a["href"] != website]

        # if the scraped links are just the four qualification index pages, the requested
        # directory does not exist on the site
        if session_links == self.websites:
            print("The qualification or the subject name is wrong")
        else:
            os.makedirs(f"{qualification}/{subject_name}", exist_ok=True)

            for session_link in session_links:
                final_soup = BeautifulSoup(requests.get(url=session_link).text, "lxml")
                papers = [f'https://pastpapers.co/cie/{a["href"]}'
                          for a in final_soup.find_all("a", class_="clearfix")][:-1]
                session = session_link.split("/")[-1].replace("%20", " ").replace("%26", "&")

                if session in self.years:
                    # inside a year folder, follow the non-PDF links (sub-folders) and
                    # download the papers listed in each one
                    subfolders = [paper.replace("%20", " ") for paper in papers if "pdf" not in paper]

                    for subfolder in subfolders:
                        final_soup = BeautifulSoup(requests.get(url=subfolder).text, "lxml")
                        papers = [f'https://pastpapers.co/cie/{a["href"]}'
                                  for a in final_soup.find_all("a", class_="clearfix")][:-1]
                        print(papers)
                        session = subfolder.split("/")[-1].replace("%20", " ")
                        os.makedirs(f"{qualification}/{subject_name}/{session}", exist_ok=True)

                        for paper in papers:
                            paper_name = paper.split("/")[-1].replace("%20", " ").replace("%26", "&")
                            with open(f"{qualification}/{subject_name}/{session}/{paper_name}", "wb") as f:
                                f.write(requests.get(url=paper).content)
                            print(f"{qualification}/{subject_name}/{session}/{paper_name} was downloaded")
                else:
                    # any other folder holds the PDFs directly, so download them as they are
                    os.makedirs(f"{qualification}/{subject_name}/{session}", exist_ok=True)

                    for paper in papers:
                        print(paper)
                        paper_name = paper.split("/")[-1]
                        with open(f"{qualification}/{subject_name}/{session}/{paper_name}", "wb") as f:
                            f.write(requests.get(url=paper).content)
                        print(f"{qualification}/{subject_name}/{session}/{paper_name} was downloaded")
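
    # A small helper sketch (not part of the original script; the name encode_subject_name
    # is illustrative): assuming pastpapers.co uses standard percent-encoding for its
    # directory names, urllib.parse.quote can build the encoded subject name that
    # download_subject expects from the plain subject title.
    @staticmethod
    def encode_subject_name(plain_name: str) -> str:
        from urllib.parse import quote

        # e.g. "Information and Communication Technology (9-1) (0983)"
        #  ->  "Information%20and%20Communication%20Technology%20%289-1%29%20%280983%29"
        return quote(plain_name)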

    # Download every past paper for a single qualification. The name is not case
    # sensitive as long as the spelling matches, so "O-LEVEL" or "o-level" are both fine.
    def download_qualification(self, name: str) -> None:
        qualifications = ["a-level", "igcse", "o-level", "pre-u"]

        if name.lower() in qualifications:
            # normalise the name to the capitalisation used in the site's URLs
            qualification = name.title()
            if name.lower() == "igcse":
                qualification = name.upper()

            os.makedirs(qualification, exist_ok=True)
            website = f"https://pastpapers.co/cie/?dir={qualification}"
            topics_soup = BeautifulSoup(requests.get(url=website).text, "lxml")
            topics = [f'https://pastpapers.co/cie/{topic["href"]}'
                      for topic in topics_soup.find_all(name="a", class_="clearfix")
                      if topic["href"] != 'https://pastpapers.co/cie/']

            for topic in topics:
                years_soup = BeautifulSoup(requests.get(url=topic).text, "lxml")
                subject_name = topic.split("/")[-1].replace("%20", " ").replace("%26", "&")
                os.makedirs(f"{qualification}/{subject_name}", exist_ok=True)

                session_links = [f'https://pastpapers.co/cie/{link["href"]}'
                                 for link in years_soup.find_all(name="a", class_="clearfix")
                                 if link["href"] != website]
                print(session_links)

                for session_link in session_links:
                    final_soup = BeautifulSoup(requests.get(url=session_link).text, "lxml")
                    papers = [f'https://pastpapers.co/cie/{a["href"]}'
                              for a in final_soup.find_all("a", class_="clearfix")][:-1]
                    session = session_link.split("/")[-1].replace("%20", " ").replace("%26", "&")

                    if session in self.years:
                        # inside a year folder, follow the non-PDF links (sub-folders) and
                        # download the papers listed in each one
                        subfolders = [paper.replace("%20", " ") for paper in papers if "pdf" not in paper]

                        for subfolder in subfolders:
                            final_soup = BeautifulSoup(requests.get(url=subfolder).text, "lxml")
                            papers = [f'https://pastpapers.co/cie/{a["href"]}'
                                      for a in final_soup.find_all("a", class_="clearfix")][:-1]
                            print(papers)
                            session = subfolder.split("/")[-1].replace("%20", " ")
                            os.makedirs(f"{qualification}/{subject_name}/{session}", exist_ok=True)

                            for paper in papers:
                                paper_name = paper.split("/")[-1].replace("%20", " ").replace("%26", "&")
                                with open(f"{qualification}/{subject_name}/{session}/{paper_name}", "wb") as f:
                                    f.write(requests.get(url=paper).content)
                                print(f"{qualification}/{subject_name}/{session}/{paper_name} was downloaded")
                    else:
                        # any other folder holds the PDFs directly, so download them as they are
                        os.makedirs(f"{qualification}/{subject_name}/{session}", exist_ok=True)

                        for paper in papers:
                            print(paper)
                            paper_name = paper.split("/")[-1]
                            with open(f"{qualification}/{subject_name}/{session}/{paper_name}", "wb") as f:
                                f.write(requests.get(url=paper).content)
                            print(f"{qualification}/{subject_name}/{session}/{paper_name} was downloaded")
        else:
            print("The qualification name is wrong")


if __name__ == "__main__":
    dp = DownloadPapers()
    # dp.download_subject("igcse", "Information%20and%20Communication%20Technology%20%289-1%29%20%280983%29")
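    # More example calls (assumptions: a network connection, write permission in the
    # current working directory, and plenty of disk space; a full run downloads a lot
    # of PDFs). Uncomment whichever one you want to run:
    # dp.show_topics()
    # dp.download_qualification("o-level")
    # dp.download_all()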