User:Slaporte/Automated categorization

From Wikisource
Jump to: navigation, search

This is the code I use to automatically categorize United States Supreme Court decisions.

  • 7/14/2010: Added new categories and an example script.

Code[edit]

 1 $catlist = array(
 2 	"tax" => "[[Category:United States Supreme Court decisions on taxation]]",
 3 	"civil.*jurisdiction" => "[[Category:United States Supreme Court decisions on civil procedure]]",
 4 	"Sherman" => "[[Category:United States Supreme Court decisions on antitrust]]",
 5 	"antitrust" => "[[Category:United States Supreme Court decisions on antitrust]]",
 6 	"copyright" => "[[Category:United States Supreme Court decisions on copyright]]",
 7 	"impeach" => "[[Category:United States Supreme Court decisions on evidence]]",
 8 	"class action" => "[[Category:United States Supreme Court decisions on class action]]",
 9 	"ERISA" => "[[Category:United States Supreme Court decisions on ERISA]]",
10 	//"employee benefit" => "[[Category:United States Supreme Court decisions on ERISA]]",
11 	"treaty" => "[[Category:United States Supreme Court decisions on treaties]]",
12 	"constitutional" => "[[Category:United States Supreme Court decisions on constitutionality]]",
13 	"delegation of .* power" => "[[Category:United States Supreme Court decisions on separation of Powers]]",
14 	"discrimination" => "[[Category:United States Supreme Court decisions on civil rights]]",
15 	"§ 1983" => "[[Category:United States Supreme Court decisions on civil rights]]",
16 	"ethical obligation" => "[[Category:United States Supreme Court decisions on professional responsibility]]",
17 	"Rule 12\(b\)\(6\)" => "[[Category:United States Supreme Court decisions on civil procedure]]",
18 	"NEPA" => "[[Category:United States Supreme Court decisions on environmental aw]]",
19 	"environmental" => "[[Category:United States Supreme Court decisions on environmental law]]",
20 	"law enforcement" => "[[Category:United States Supreme Court decisions on criminal law]]",
21 	"First Amendment" => "[[Category:United States Supreme Court decisions on the First Amendment]]",
22 	"Second Amendment" => "[[Category:United States Supreme Court decisions on the Second Amendment]]",
23 	"Fourth Amendment" => "[[Category:United States Supreme Court decisions on the Fourth Amendment]]",
24 	"Fifth Amendment" => "[[Category:United States Supreme Court decisions on the Fifth Amendment]]",
25 	"Eighth Amendment" => "[[Category:United States Supreme Court decisions on the Eighth Amendment]]",
26 	"[Dd]ue [Pp]rocess" => "[[Category:United States Supreme Court decisions on due process]]",
27 	"community property" => "[[Category:United States Supreme Court decisions on property]]",
28 	"disparate-impact" => "[[Category:United States Supreme Court decisions on civil rights]]",
29 	"freedom of speech" => "[[Category:United States Supreme Court decisions on freedom of speech]]", 
30 	"time, place, and manner" => "[[Category:United States Supreme Court decisions on freedom of speech]]",
31 	"clear and present danger" => "[[Category:United States Supreme Court decisions on freedom of speech]]",
32 	"free exercise of religion" => "[[Category:United States Supreme Court decisions on religion]]",
33 	"Establishment Clause" => "[[Category:United States Supreme Court decisions on religion]]",
34 	"Sixth Amendment" => "[[Category:United States Supreme Court decisions on the Sixth Amendment]]",
35 	"Commerce Clause" => "[[Category:United States Supreme Court decisions on the Commerce Clause]]",
36 	"justiciable" => "[[Category:United States Supreme Court decisions on justiciability]]",
37 	"justiciability" => "[[Category:United States Supreme Court decisions on justiciability]]",
38 	"abortion" => "[[Category:United States Supreme Court decisions on abortion]]",
39 	"SEC" => "[[Category:United States Supreme Court decisions on securities]]",
40 	"arbitrary and capricious" => "[[Category:United States Supreme Court decisions on statutory interpretation]]",
41 	"complete diversity" => "[[Category:United States Supreme Court decisions on civil procedure]]",
42 );
43 
/**
 * Suggests category wikitext for a text by testing it against a pattern list.
 *
 * @param string $txt  Text to search.
 * @param array  $list Map of PCRE fragment (without delimiters) => category wikitext.
 * @return array Distinct matched categories in list order followed by
 *               "[[Category:Automated categorization]]", or only the
 *               "Uncategorized" category when no pattern matched.
 */
function categoryGuess($txt, $list){
	$categories = array();
	$txt = (string)$txt;

	foreach($list as $key => $cat) {
		// Numeric-looking keys arrive as integers; patterns are always strings.
		$key = (string)$key;

		// An empty fragment matches every text, which would add a bogus entry
		// and make the "Uncategorized" fallback below unreachable.
		if($key === ""){
			continue;
		}

		// Delimit with a control character instead of "/" so that a fragment
		// containing a slash cannot end the pattern early. An invalid fragment
		// makes preg_match() return false and is treated as "no match".
		if(preg_match("\x01" . $key . "\x01", $txt) !== 1){
			continue;
		}

		// Several patterns may map to the same category; list it once.
		if(!in_array($cat, $categories, true)) {
			$categories[] = $cat;
		}
	}

	if(count($categories) === 0){
		$categories[] = "[[Category:Uncategorized United States Supreme Court decision]]";
	} else {
		// Marks the page as categorized by this tool.
		$categories[] = "[[Category:Automated categorization]]";
	}

	return $categories;
}

Example script[edit]

 1 <?php
 2 function getRawText($url){
 3 	$ch = curl_init();
 4 	$timeout = 5;
 5 	curl_setopt($ch,CURLOPT_URL,$url);
 6 	curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
 7 	curl_setopt($ch,CURLOPT_CONNECTTIMEOUT,$timeout);
 8 	curl_setopt($ch, CURLOPT_USERAGENT, 'User-Agent: guessingcategory/1 [[User:Slaporte]]');
 9 	$data = curl_exec($ch);
10 	curl_close($ch);
11 	
12 	return $data;
13 }
/**
 * Suggests category wikitext for a text by testing it against a pattern list.
 *
 * @param string $txt  Text to search.
 * @param array  $list Map of PCRE fragment (without delimiters) => category wikitext.
 * @return array Distinct matched categories in list order followed by
 *               "[[Category:Automated categorization]]", or only the
 *               "Uncategorized" category when no pattern matched.
 */
function categoryGuess($txt, $list){
	$categories = array();
	$txt = (string)$txt;

	foreach($list as $key => $cat) {
		// Numeric-looking keys arrive as integers; patterns are always strings.
		$key = (string)$key;

		// An empty fragment matches every text, which would add a bogus entry
		// and make the "Uncategorized" fallback below unreachable.
		if($key === ""){
			continue;
		}

		// Delimit with a control character instead of "/" so that a fragment
		// containing a slash cannot end the pattern early. An invalid fragment
		// makes preg_match() return false and is treated as "no match".
		if(preg_match("\x01" . $key . "\x01", $txt) !== 1){
			continue;
		}

		// Several patterns may map to the same category; list it once.
		if(!in_array($cat, $categories, true)) {
			$categories[] = $cat;
		}
	}

	if(count($categories) === 0){
		$categories[] = "[[Category:Uncategorized United States Supreme Court decision]]";
	} else {
		// Marks the page as categorized by this tool.
		$categories[] = "[[Category:Automated categorization]]";
	}

	return $categories;
}
/**
 * Downloads the pattern list page and parses it into a pattern => category map.
 *
 * Each useful line of the page has the form "pattern => category wikitext".
 * Lines without the "=>" separator (blank lines, stray markup) and lines with
 * an empty pattern are skipped.
 *
 * @return array Map of trimmed pattern => trimmed category wikitext; empty
 *               when the page could not be fetched or holds no entries.
 */
function getList(){
	$txt = getRawText("http://en.wikisource.org/w/index.php?action=raw&title=User:Slaporte/Automated_categorization/list");

	// Strip the <nowiki>/<pre> wrapper lines surrounding the list. A failed
	// fetch yields false, which the cast turns into an empty string.
	$txt = str_replace(
		array("<nowiki>\n", "<pre>\n", "\n</nowiki>", "\n</pre>"),
		"",
		(string)$txt
	);

	$cats = array();
	foreach(explode("\n", $txt) as $item) {
		// Split on the first "=>" only so the category text is kept whole.
		$pair = explode("=>", $item, 2);
		if(count($pair) < 2){
			continue;
		}

		$pattern = trim($pair[0]);
		if($pattern === ""){
			continue;
		}

		$cats[$pattern] = trim($pair[1]);
	}

	return $cats;
}
/**
 * Prints each category on its own line, escaped for an HTML context.
 *
 * The values are written into the page's markup, so characters that are
 * special in HTML are converted to entities before printing.
 *
 * @param array $cats Category strings to print.
 * @return void
 */
function displayCats($cats){
	foreach($cats as $cat){
		print htmlspecialchars((string)$cat, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')."\n";
	}
}
// Remember the submitted page title so the form can show it again.
// Only a plain string is accepted; a request such as PageName[]=x would
// otherwise put an array into $page.
if(isset($_POST["PageName"]) && is_string($_POST["PageName"])){
	$page = $_POST["PageName"];
}
61 ?>
<html>
<head><title>CategoryGuesser</title>
</head>
<body>
<div id="main">
<h1>Court Case Category Suggestion Tool</h1>
<form method="post" action="categoryguess.php">
<label for="PageName">Page title:</label>
<?php
// Echo the previously submitted title back into the field. It is escaped so
// that quotes or markup in the title cannot break out of the value attribute.
?>
<input type="text" id="PageName" name="PageName" value="<?php if(isset($page) && is_string($page)){print htmlspecialchars($page, ENT_QUOTES, 'UTF-8');} ?>">
<div class="subtitle"><p>Enter the title of the page on wikisource, such as <i>International Shoe v. State of Washington</i></p></div>
<button type="submit" value="Submit" id="find">Submit</button>
<br />
<div id='results'>
<?php
// After a submission: fetch the raw wikitext of the requested page and print
// the suggested categories inside a textarea.
if(isset($_POST["PageName"]) && is_string($_POST["PageName"])){
	$page = $_POST["PageName"];
	// The title is sent with underscores in place of spaces.
	$page = str_replace(" ","_",$page);
	// Percent-encode the title so characters such as "&", "#" or "?" stay
	// part of the title parameter instead of altering the query string.
	$url = "http://en.wikisource.org/w/index.php?action=raw&title=".rawurlencode($page);
	print "<br/><label for='cats'>Suggested Categories:</label><br/><br/>";
	print "<textarea cols=70 rows=30 id='cats' name='cats'>";
	displayCats(categoryGuess(getRawText($url),getList()));
	print "</textarea>";
}
?>
86 </div>
87 </form>
88 </div>
89 <p><a href="http://en.wikisource.org/wiki/User:Slaporte/Automated_categorization/list">Add or edit</a> category suggestions (live!). <a href="http://en.wikisource.org/wiki/User:Slaporte/Automated_categorization">source code and documentation</a> available.</p><p>leave <a href="http://en.wikisource.org/wiki/User_talk:Slaporte/Automated_categorization">feedback</a>.</p>
90 </body>
91 </html>